cache.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. """Cache validation for ``*.mineru_raw/`` bundles.
  2. Validation policy (settled in design discussion; see
  3. ``LightRAGSidecarFormat-zh.md`` related notes):
  4. 1. ``_manifest.json`` exists, parses, ``version=1.0`` ∧ ``engine=mineru``.
  5. 2. **Source size fast-path**: ``source_file.stat().st_size`` matches manifest;
  6. mismatch → miss without hashing.
  7. 3. **Source content_hash**: full sha256 of the current source file matches
  8. manifest. The size+hash pair is computed by a single-read helper so the
  9. stored manifest is internally self-consistent.
  10. 4. **API mode**: if the manifest recorded ``api_mode`` and it differs from
  11. current ``MINERU_API_MODE``, miss.
  12. 5. **Parser options**: the manifest must record an ``options_signature`` that
  13. matches the current effective MinerU request options. Missing signatures
  14. from older manifests are treated as stale.
  15. 6. **Engine version**: if ``MINERU_ENGINE_VERSION`` is set and the manifest
  16. recorded a non-empty one, they must match.
  17. 7. **Endpoint signature**: if the active MinerU endpoint is set and the
  18. manifest recorded a non-empty one, they must match.
  19. 8. **Critical file**: ``content_list.json`` must exist with matching size
  20. **and** sha256 — sha256 here is the final tie-breaker against silent
  21. corruption affecting the file the adapter depends on.
  22. 9. **Other files**: size-only verification (cheap; covers most corruption
  23. modes for image / middle.json / layout.pdf).
  24. Any failed step ⇒ cache miss; the caller wipes the directory contents
  25. (preserving the directory itself) and re-runs the download.
  26. """
  27. from __future__ import annotations
  28. import hashlib
  29. import json
  30. import os
  31. from dataclasses import asdict, dataclass
  32. from pathlib import Path
  33. from typing import Any
  34. from lightrag.constants import MINERU_RAW_DIR_SUFFIX, PARSED_DIR_SUFFIX
  35. from lightrag.parser.external.mineru.manifest import load_manifest
  36. from lightrag.utils import logger
# Fallback API mode, used when MINERU_API_MODE is unset or is not one of
# "official" / "local".
DEFAULT_MINERU_API_MODE = "local"
# Endpoint used for the official API when MINERU_OFFICIAL_ENDPOINT is unset.
DEFAULT_MINERU_OFFICIAL_ENDPOINT = "https://mineru.net"
# Fallbacks for MINERU_MODEL_VERSION and MINERU_LANGUAGE (also applied when
# the variable is set but blank).
DEFAULT_MINERU_MODEL_VERSION = "vlm"
DEFAULT_MINERU_LANGUAGE = "ch"
# Fallbacks for MINERU_LOCAL_BACKEND, MINERU_LOCAL_PARSE_METHOD and
# MINERU_LOCAL_IMAGE_ANALYSIS.
DEFAULT_MINERU_LOCAL_BACKEND = "hybrid-auto-engine"
DEFAULT_MINERU_LOCAL_PARSE_METHOD = "auto"
DEFAULT_MINERU_LOCAL_IMAGE_ANALYSIS = True
# Default zero-based page window for local mode, used when neither
# MINERU_PAGE_RANGES nor MINERU_LOCAL_START_PAGE_ID / MINERU_LOCAL_END_PAGE_ID
# override it.
DEFAULT_MINERU_LOCAL_START_PAGE_ID = 0
DEFAULT_MINERU_LOCAL_END_PAGE_ID = 99999
# Fallbacks for MINERU_ENABLE_TABLE, MINERU_ENABLE_FORMULA and MINERU_IS_OCR.
DEFAULT_MINERU_ENABLE_TABLE = True
DEFAULT_MINERU_ENABLE_FORMULA = True
DEFAULT_MINERU_IS_OCR = False
  49. def raw_dir_for_parsed_dir(parsed_dir: Path) -> Path:
  50. """Sibling raw dir for a given ``*.parsed`` dir.
  51. ``foo.parsed/`` → ``foo.mineru_raw/``. Used both at download time and at
  52. cache check time so the layout is canonical.
  53. """
  54. stem = parsed_dir.name
  55. if stem.endswith(PARSED_DIR_SUFFIX):
  56. stem = stem[: -len(PARSED_DIR_SUFFIX)]
  57. return parsed_dir.parent / f"{stem}{MINERU_RAW_DIR_SUFFIX}"
  58. def clear_dir_contents(directory: Path) -> None:
  59. """Delete everything inside ``directory`` but keep ``directory`` itself."""
  60. if not directory.exists():
  61. return
  62. for entry in directory.iterdir():
  63. try:
  64. if entry.is_dir() and not entry.is_symlink():
  65. _rmtree_safe(entry)
  66. else:
  67. entry.unlink()
  68. except OSError:
  69. # Best-effort cleanup; subsequent download will overwrite.
  70. continue
  71. def _rmtree_safe(directory: Path) -> None:
  72. import shutil
  73. shutil.rmtree(directory, ignore_errors=True)
  74. def compute_size_and_hash(path: Path) -> tuple[int, str]:
  75. """Single-read computation of ``(size_bytes, "sha256:<hex>")``.
  76. Manifest writes use this so the recorded size and hash are guaranteed to
  77. describe the same byte stream; using two ``open()`` calls would risk a
  78. TOCTOU mismatch if the file changed in between.
  79. """
  80. h = hashlib.sha256()
  81. size = 0
  82. with path.open("rb") as f:
  83. for chunk in iter(lambda: f.read(1 << 20), b""):
  84. h.update(chunk)
  85. size += len(chunk)
  86. return size, f"sha256:{h.hexdigest()}"
  87. def _current_api_mode() -> str:
  88. mode = _normalize_api_mode(os.getenv("MINERU_API_MODE", DEFAULT_MINERU_API_MODE))
  89. return mode
  90. def _normalize_api_mode(mode: str) -> str:
  91. mode = str(mode or "").strip().lower()
  92. return mode if mode in {"official", "local"} else DEFAULT_MINERU_API_MODE
  93. def _env_bool(name: str, default: bool) -> bool:
  94. raw = os.getenv(name, "").strip().lower()
  95. if raw in {"1", "true", "yes", "on"}:
  96. return True
  97. if raw in {"0", "false", "no", "off"}:
  98. return False
  99. return default
  100. def _env_int(name: str, default: int) -> int:
  101. raw = os.getenv(name, "").strip()
  102. if not raw:
  103. return default
  104. try:
  105. return int(raw)
  106. except ValueError:
  107. logger.warning(
  108. "[mineru_raw] %s=%r is not an integer; using %s", name, raw, default
  109. )
  110. return default
  111. def _current_endpoint_signature() -> str:
  112. mode = _current_api_mode()
  113. if mode == "official":
  114. return (
  115. os.getenv("MINERU_OFFICIAL_ENDPOINT", DEFAULT_MINERU_OFFICIAL_ENDPOINT)
  116. .strip()
  117. .rstrip("/")
  118. )
  119. if mode == "local":
  120. return os.getenv("MINERU_LOCAL_ENDPOINT", "").strip().rstrip("/")
  121. return ""
  122. def local_page_bounds(page_ranges: str) -> tuple[int, int]:
  123. raw = page_ranges.strip()
  124. if not raw:
  125. return DEFAULT_MINERU_LOCAL_START_PAGE_ID, DEFAULT_MINERU_LOCAL_END_PAGE_ID
  126. if "," in raw:
  127. raise ValueError(
  128. "MINERU_PAGE_RANGES with MINERU_API_MODE=local supports only a "
  129. "single page or simple range such as '1-10'"
  130. )
  131. if raw.isdigit():
  132. page = max(int(raw), 1)
  133. return page - 1, page - 1
  134. if "-" in raw:
  135. left, _, right = raw.partition("-")
  136. if left.isdigit() and right.isdigit():
  137. start = max(int(left), 1)
  138. end = max(int(right), start)
  139. return start - 1, end - 1
  140. raise ValueError(
  141. "MINERU_PAGE_RANGES with MINERU_API_MODE=local must be a single "
  142. "positive page number or simple range such as '1-10'"
  143. )
  144. @dataclass(frozen=True)
  145. class MinerUParserOptions:
  146. """Effective MinerU parser options used both for live requests and the
  147. cache signature.
  148. Constructed once via :meth:`from_env` so the client and the cache
  149. validator agree on every defaulting / normalization rule.
  150. """
  151. api_mode: str
  152. model_version: str
  153. language: str
  154. enable_table: bool
  155. enable_formula: bool
  156. is_ocr: bool
  157. page_ranges: str
  158. local_backend: str
  159. local_parse_method: str
  160. local_image_analysis: bool
  161. local_start_page_id: int
  162. local_end_page_id: int
  163. @classmethod
  164. def from_env(cls, *, api_mode: str | None = None) -> "MinerUParserOptions":
  165. mode = (
  166. _normalize_api_mode(api_mode)
  167. if api_mode is not None
  168. else _current_api_mode()
  169. )
  170. page_ranges = os.getenv("MINERU_PAGE_RANGES", "").strip()
  171. local_start = _env_int(
  172. "MINERU_LOCAL_START_PAGE_ID", DEFAULT_MINERU_LOCAL_START_PAGE_ID
  173. )
  174. local_end = _env_int(
  175. "MINERU_LOCAL_END_PAGE_ID", DEFAULT_MINERU_LOCAL_END_PAGE_ID
  176. )
  177. if mode == "local" and page_ranges:
  178. local_start, local_end = local_page_bounds(page_ranges)
  179. return cls(
  180. api_mode=mode,
  181. model_version=(
  182. os.getenv("MINERU_MODEL_VERSION", DEFAULT_MINERU_MODEL_VERSION).strip()
  183. or DEFAULT_MINERU_MODEL_VERSION
  184. ),
  185. language=(
  186. os.getenv("MINERU_LANGUAGE", DEFAULT_MINERU_LANGUAGE).strip()
  187. or DEFAULT_MINERU_LANGUAGE
  188. ),
  189. enable_table=_env_bool("MINERU_ENABLE_TABLE", DEFAULT_MINERU_ENABLE_TABLE),
  190. enable_formula=_env_bool(
  191. "MINERU_ENABLE_FORMULA", DEFAULT_MINERU_ENABLE_FORMULA
  192. ),
  193. is_ocr=_env_bool("MINERU_IS_OCR", DEFAULT_MINERU_IS_OCR),
  194. page_ranges=page_ranges,
  195. local_backend=(
  196. os.getenv("MINERU_LOCAL_BACKEND", DEFAULT_MINERU_LOCAL_BACKEND).strip()
  197. or DEFAULT_MINERU_LOCAL_BACKEND
  198. ),
  199. local_parse_method=(
  200. os.getenv(
  201. "MINERU_LOCAL_PARSE_METHOD", DEFAULT_MINERU_LOCAL_PARSE_METHOD
  202. ).strip()
  203. or DEFAULT_MINERU_LOCAL_PARSE_METHOD
  204. ),
  205. local_image_analysis=_env_bool(
  206. "MINERU_LOCAL_IMAGE_ANALYSIS", DEFAULT_MINERU_LOCAL_IMAGE_ANALYSIS
  207. ),
  208. local_start_page_id=local_start,
  209. local_end_page_id=local_end,
  210. )
  211. def signature(self) -> str:
  212. return mineru_options_signature(**asdict(self))
  213. def mineru_options_signature(
  214. *,
  215. api_mode: str,
  216. model_version: str = DEFAULT_MINERU_MODEL_VERSION,
  217. language: str = DEFAULT_MINERU_LANGUAGE,
  218. enable_table: bool = DEFAULT_MINERU_ENABLE_TABLE,
  219. enable_formula: bool = DEFAULT_MINERU_ENABLE_FORMULA,
  220. is_ocr: bool = DEFAULT_MINERU_IS_OCR,
  221. page_ranges: str = "",
  222. local_backend: str = DEFAULT_MINERU_LOCAL_BACKEND,
  223. local_parse_method: str = DEFAULT_MINERU_LOCAL_PARSE_METHOD,
  224. local_image_analysis: bool = DEFAULT_MINERU_LOCAL_IMAGE_ANALYSIS,
  225. local_start_page_id: int = DEFAULT_MINERU_LOCAL_START_PAGE_ID,
  226. local_end_page_id: int = DEFAULT_MINERU_LOCAL_END_PAGE_ID,
  227. ) -> str:
  228. mode = _normalize_api_mode(api_mode)
  229. payload: dict[str, Any] = {
  230. "signature_version": 1,
  231. "api_mode": mode,
  232. "language": str(language or "").strip() or DEFAULT_MINERU_LANGUAGE,
  233. "enable_table": bool(enable_table),
  234. "enable_formula": bool(enable_formula),
  235. }
  236. if mode == "official":
  237. payload.update(
  238. {
  239. "model_version": str(model_version or "").strip()
  240. or DEFAULT_MINERU_MODEL_VERSION,
  241. "is_ocr": bool(is_ocr),
  242. "page_ranges": str(page_ranges or "").strip(),
  243. }
  244. )
  245. else:
  246. payload.update(
  247. {
  248. "local_backend": str(local_backend or "").strip()
  249. or DEFAULT_MINERU_LOCAL_BACKEND,
  250. "local_parse_method": str(local_parse_method or "").strip()
  251. or DEFAULT_MINERU_LOCAL_PARSE_METHOD,
  252. "local_image_analysis": bool(local_image_analysis),
  253. "local_start_page_id": int(local_start_page_id),
  254. "local_end_page_id": int(local_end_page_id),
  255. }
  256. )
  257. raw = json.dumps(payload, sort_keys=True, separators=(",", ":"))
  258. return "sha256:" + hashlib.sha256(raw.encode("utf-8")).hexdigest()
  259. def current_mineru_options_signature() -> str:
  260. return MinerUParserOptions.from_env().signature()
  261. def is_bundle_valid(raw_dir: Path, source_file: Path) -> bool:
  262. """Return True iff the bundle is intact and matches the current source.
  263. See module docstring for the full policy. Returns False on any of:
  264. missing manifest, malformed manifest, schema version mismatch, source
  265. size/hash mismatch, parser options mismatch, engine/endpoint env mismatch,
  266. critical file missing or corrupted, or any non-critical file size mismatch.
  267. """
  268. if not raw_dir.is_dir():
  269. return False
  270. manifest = load_manifest(raw_dir)
  271. if manifest is None:
  272. return False
  273. # 1. Source size fast-path
  274. try:
  275. cur_size = source_file.stat().st_size
  276. except OSError:
  277. return False
  278. if cur_size != int(manifest.source_size_bytes):
  279. return False
  280. # 2. Source content_hash
  281. _, cur_hash = compute_size_and_hash(source_file)
  282. if cur_hash != manifest.source_content_hash:
  283. return False
  284. # 3. API mode (only when manifest had one; old manifests remain compatible)
  285. cur_api_mode = _current_api_mode()
  286. if manifest.api_mode and cur_api_mode != manifest.api_mode:
  287. return False
  288. # 4. Parser options. Old manifests did not record this and must miss so
  289. # changes such as MINERU_LOCAL_BACKEND cannot silently reuse stale output.
  290. if not manifest.options_signature:
  291. return False
  292. if current_mineru_options_signature() != manifest.options_signature:
  293. return False
  294. # 5. Engine version (only when current env exposes one AND manifest had one)
  295. cur_engine_version = os.getenv("MINERU_ENGINE_VERSION", "").strip()
  296. if (
  297. cur_engine_version
  298. and manifest.engine_version
  299. and cur_engine_version != manifest.engine_version
  300. ):
  301. return False
  302. # 6. Endpoint signature
  303. cur_endpoint = _current_endpoint_signature()
  304. if (
  305. cur_endpoint
  306. and manifest.endpoint_signature
  307. and cur_endpoint != manifest.endpoint_signature
  308. ):
  309. return False
  310. # 7. Critical file: size + sha256
  311. crit = manifest.critical_file
  312. crit_path = raw_dir / crit.path
  313. try:
  314. if crit_path.stat().st_size != int(crit.size):
  315. return False
  316. except OSError:
  317. return False
  318. if crit.sha256:
  319. _, crit_actual = compute_size_and_hash(crit_path)
  320. if crit_actual != crit.sha256:
  321. return False
  322. # 8. Other files: size only
  323. for entry in manifest.files:
  324. ep = raw_dir / entry.path
  325. try:
  326. if ep.stat().st_size != int(entry.size):
  327. return False
  328. except OSError:
  329. return False
  330. return True
# Public API of this module. MINERU_RAW_DIR_SUFFIX is imported from
# lightrag.constants and re-exported here.
__all__ = [
    "MINERU_RAW_DIR_SUFFIX",
    "MinerUParserOptions",
    "clear_dir_contents",
    "compute_size_and_hash",
    "current_mineru_options_signature",
    "is_bundle_valid",
    "local_page_bounds",
    "mineru_options_signature",
    "raw_dir_for_parsed_dir",
]