test_cache.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. """``*.mineru_raw/`` cache validation tests.
  2. Covers every failure mode that triggers a re-download:
  3. - missing / malformed manifest
  4. - source file size mismatch (fast-path)
  5. - source file content_hash mismatch
  6. - parser options signature missing / mismatch
  7. - engine version / endpoint env mismatch
  8. - critical_file (content_list.json) size or sha256 mismatch
  9. - any non-critical file size mismatch
  10. """
  11. from __future__ import annotations
  12. import json
  13. from pathlib import Path
  14. import pytest
  15. from lightrag.parser.external.mineru import (
  16. Manifest,
  17. ManifestFile,
  18. clear_dir_contents,
  19. compute_size_and_hash,
  20. is_bundle_valid,
  21. raw_dir_for_parsed_dir,
  22. )
  23. from lightrag.parser.external.mineru.cache import current_mineru_options_signature
  24. from lightrag.parser.external.mineru.manifest import write_manifest
  25. # ---------------------------------------------------------------------------
  26. # Fixtures
  27. # ---------------------------------------------------------------------------
  28. @pytest.fixture
  29. def source_file(tmp_path: Path) -> Path:
  30. p = tmp_path / "src.pdf"
  31. p.write_bytes(b"Hello PDF" * 100)
  32. return p
  33. @pytest.fixture
  34. def fresh_bundle(tmp_path: Path, source_file: Path) -> tuple[Path, Manifest]:
  35. """Build a fully-valid bundle alongside ``source_file`` and return
  36. ``(raw_dir, manifest)``."""
  37. raw = tmp_path / "src.mineru_raw"
  38. raw.mkdir()
  39. content_list = raw / "content_list.json"
  40. content_list.write_text('[{"type":"text","text":"hi"}]', encoding="utf-8")
  41. images = raw / "images"
  42. images.mkdir()
  43. (images / "img1.png").write_bytes(b"PNG" * 50)
  44. (images / "img2.png").write_bytes(b"PNG" * 60)
  45. src_size, src_hash = compute_size_and_hash(source_file)
  46. crit_size, crit_hash = compute_size_and_hash(content_list)
  47. files = [
  48. ManifestFile(path="images/img1.png", size=(images / "img1.png").stat().st_size),
  49. ManifestFile(path="images/img2.png", size=(images / "img2.png").stat().st_size),
  50. ]
  51. manifest = Manifest(
  52. source_content_hash=src_hash,
  53. source_size_bytes=src_size,
  54. source_filename_at_parse=source_file.name,
  55. critical_file=ManifestFile(
  56. path="content_list.json", size=crit_size, sha256=crit_hash
  57. ),
  58. files=files,
  59. total_size_bytes=crit_size + sum(f.size for f in files),
  60. task_id="task-1",
  61. api_mode="local",
  62. options_signature=current_mineru_options_signature(),
  63. )
  64. write_manifest(raw, manifest)
  65. return raw, manifest
  66. # ---------------------------------------------------------------------------
  67. # Layout helpers
  68. # ---------------------------------------------------------------------------
  69. @pytest.mark.offline
  70. def test_raw_dir_naming(tmp_path: Path) -> None:
  71. parsed = tmp_path / "report.pdf.parsed"
  72. raw = raw_dir_for_parsed_dir(parsed)
  73. assert raw.name == "report.pdf.mineru_raw"
  74. assert raw.parent == parsed.parent
  75. # ---------------------------------------------------------------------------
  76. # Validation: happy path + every individual failure mode
  77. # ---------------------------------------------------------------------------
  78. @pytest.mark.offline
  79. def test_is_bundle_valid_happy_path(
  80. fresh_bundle: tuple[Path, Manifest], source_file: Path
  81. ) -> None:
  82. raw, _ = fresh_bundle
  83. assert is_bundle_valid(raw, source_file) is True
  84. @pytest.mark.offline
  85. def test_invalid_when_manifest_missing(
  86. fresh_bundle: tuple[Path, Manifest], source_file: Path
  87. ) -> None:
  88. raw, _ = fresh_bundle
  89. (raw / "_manifest.json").unlink()
  90. assert is_bundle_valid(raw, source_file) is False
  91. @pytest.mark.offline
  92. def test_invalid_when_manifest_malformed(
  93. fresh_bundle: tuple[Path, Manifest], source_file: Path
  94. ) -> None:
  95. raw, _ = fresh_bundle
  96. (raw / "_manifest.json").write_text("not json")
  97. assert is_bundle_valid(raw, source_file) is False
  98. @pytest.mark.offline
  99. def test_invalid_when_manifest_wrong_engine(
  100. fresh_bundle: tuple[Path, Manifest], source_file: Path
  101. ) -> None:
  102. raw, _ = fresh_bundle
  103. payload = json.loads((raw / "_manifest.json").read_text())
  104. payload["engine"] = "docling"
  105. (raw / "_manifest.json").write_text(json.dumps(payload))
  106. assert is_bundle_valid(raw, source_file) is False
  107. @pytest.mark.offline
  108. def test_invalid_when_source_size_changes(
  109. fresh_bundle: tuple[Path, Manifest], source_file: Path
  110. ) -> None:
  111. # Append bytes to the source file — size diverges from manifest fast-path.
  112. raw, _ = fresh_bundle
  113. with source_file.open("ab") as fh:
  114. fh.write(b"x")
  115. assert is_bundle_valid(raw, source_file) is False
  116. @pytest.mark.offline
  117. def test_invalid_when_source_hash_changes_but_size_same(
  118. fresh_bundle: tuple[Path, Manifest], source_file: Path
  119. ) -> None:
  120. """In-place rewrite that preserves byte size but mutates content. The
  121. fast-path passes but the full hash check catches it."""
  122. raw, _ = fresh_bundle
  123. data = source_file.read_bytes()
  124. # Flip first byte; keep length identical.
  125. mutated = bytes([data[0] ^ 0xFF]) + data[1:]
  126. assert len(mutated) == len(data)
  127. source_file.write_bytes(mutated)
  128. assert is_bundle_valid(raw, source_file) is False
  129. @pytest.mark.offline
  130. def test_invalid_when_engine_version_mismatch(
  131. fresh_bundle: tuple[Path, Manifest],
  132. source_file: Path,
  133. monkeypatch: pytest.MonkeyPatch,
  134. ) -> None:
  135. raw, _ = fresh_bundle
  136. payload = json.loads((raw / "_manifest.json").read_text())
  137. payload["engine_version"] = "magic-pdf 1.5.4"
  138. (raw / "_manifest.json").write_text(json.dumps(payload))
  139. monkeypatch.setenv("MINERU_ENGINE_VERSION", "magic-pdf 1.6.0")
  140. assert is_bundle_valid(raw, source_file) is False
  141. @pytest.mark.offline
  142. def test_engine_version_match_passes(
  143. fresh_bundle: tuple[Path, Manifest],
  144. source_file: Path,
  145. monkeypatch: pytest.MonkeyPatch,
  146. ) -> None:
  147. raw, _ = fresh_bundle
  148. payload = json.loads((raw / "_manifest.json").read_text())
  149. payload["engine_version"] = "magic-pdf 1.5.4"
  150. (raw / "_manifest.json").write_text(json.dumps(payload))
  151. monkeypatch.setenv("MINERU_ENGINE_VERSION", "magic-pdf 1.5.4")
  152. assert is_bundle_valid(raw, source_file) is True
  153. @pytest.mark.offline
  154. def test_engine_version_skip_when_either_side_blank(
  155. fresh_bundle: tuple[Path, Manifest],
  156. source_file: Path,
  157. monkeypatch: pytest.MonkeyPatch,
  158. ) -> None:
  159. """Blank manifest engine_version + non-blank env should NOT invalidate
  160. (no signal from manifest); same for the reverse."""
  161. raw, _ = fresh_bundle
  162. # Manifest engine_version is empty by default.
  163. monkeypatch.setenv("MINERU_ENGINE_VERSION", "anything")
  164. assert is_bundle_valid(raw, source_file) is True
  165. @pytest.mark.offline
  166. def test_invalid_when_api_mode_mismatch(
  167. fresh_bundle: tuple[Path, Manifest],
  168. source_file: Path,
  169. monkeypatch: pytest.MonkeyPatch,
  170. ) -> None:
  171. raw, _ = fresh_bundle
  172. payload = json.loads((raw / "_manifest.json").read_text())
  173. payload["api_mode"] = "local"
  174. (raw / "_manifest.json").write_text(json.dumps(payload))
  175. monkeypatch.setenv("MINERU_API_MODE", "official")
  176. monkeypatch.setenv("MINERU_API_TOKEN", "token")
  177. assert is_bundle_valid(raw, source_file) is False
  178. @pytest.mark.offline
  179. def test_invalid_when_options_signature_missing(
  180. fresh_bundle: tuple[Path, Manifest], source_file: Path
  181. ) -> None:
  182. raw, _ = fresh_bundle
  183. payload = json.loads((raw / "_manifest.json").read_text())
  184. payload.pop("options_signature", None)
  185. (raw / "_manifest.json").write_text(json.dumps(payload))
  186. assert is_bundle_valid(raw, source_file) is False
  187. @pytest.mark.offline
  188. @pytest.mark.parametrize(
  189. ("key", "value"),
  190. [
  191. ("MINERU_LOCAL_BACKEND", "pipeline"),
  192. ("MINERU_LOCAL_PARSE_METHOD", "ocr"),
  193. ("MINERU_LOCAL_IMAGE_ANALYSIS", "false"),
  194. ("MINERU_LOCAL_START_PAGE_ID", "1"),
  195. ],
  196. )
  197. def test_invalid_when_local_parser_options_change(
  198. fresh_bundle: tuple[Path, Manifest],
  199. source_file: Path,
  200. monkeypatch: pytest.MonkeyPatch,
  201. key: str,
  202. value: str,
  203. ) -> None:
  204. raw, _ = fresh_bundle
  205. monkeypatch.setenv("MINERU_API_MODE", "local")
  206. monkeypatch.setenv(key, value)
  207. assert is_bundle_valid(raw, source_file) is False
  208. @pytest.mark.offline
  209. @pytest.mark.parametrize(
  210. ("key", "value"),
  211. [
  212. ("MINERU_MODEL_VERSION", "pipeline"),
  213. ("MINERU_IS_OCR", "true"),
  214. ("MINERU_PAGE_RANGES", "1-5"),
  215. ("MINERU_LANGUAGE", "en"),
  216. ("MINERU_ENABLE_TABLE", "false"),
  217. ("MINERU_ENABLE_FORMULA", "false"),
  218. ],
  219. )
  220. def test_invalid_when_official_parser_options_change(
  221. tmp_path: Path,
  222. source_file: Path,
  223. monkeypatch: pytest.MonkeyPatch,
  224. key: str,
  225. value: str,
  226. ) -> None:
  227. """Symmetric coverage for the official-mode partition of the signature.
  228. Build a bundle whose ``options_signature`` reflects the official defaults,
  229. sanity-check that it validates, then flip ``key`` and assert a cache miss.
  230. """
  231. monkeypatch.setenv("MINERU_API_MODE", "official")
  232. raw = tmp_path / "src.mineru_raw"
  233. raw.mkdir()
  234. content_list = raw / "content_list.json"
  235. content_list.write_text('[{"type":"text","text":"hi"}]', encoding="utf-8")
  236. crit_size, crit_hash = compute_size_and_hash(content_list)
  237. src_size, src_hash = compute_size_and_hash(source_file)
  238. manifest = Manifest(
  239. source_content_hash=src_hash,
  240. source_size_bytes=src_size,
  241. source_filename_at_parse=source_file.name,
  242. critical_file=ManifestFile(
  243. path="content_list.json", size=crit_size, sha256=crit_hash
  244. ),
  245. files=[],
  246. total_size_bytes=crit_size,
  247. task_id="task-official",
  248. api_mode="official",
  249. options_signature=current_mineru_options_signature(),
  250. )
  251. write_manifest(raw, manifest)
  252. assert is_bundle_valid(raw, source_file) is True
  253. monkeypatch.setenv(key, value)
  254. assert is_bundle_valid(raw, source_file) is False
  255. @pytest.mark.offline
  256. def test_invalid_when_endpoint_signature_mismatch(
  257. fresh_bundle: tuple[Path, Manifest],
  258. source_file: Path,
  259. monkeypatch: pytest.MonkeyPatch,
  260. ) -> None:
  261. raw, _ = fresh_bundle
  262. payload = json.loads((raw / "_manifest.json").read_text())
  263. payload["api_mode"] = "local"
  264. payload["endpoint_signature"] = "http://old.example"
  265. (raw / "_manifest.json").write_text(json.dumps(payload))
  266. monkeypatch.setenv("MINERU_API_MODE", "local")
  267. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://new.example")
  268. assert is_bundle_valid(raw, source_file) is False
  269. @pytest.mark.offline
  270. def test_endpoint_signature_uses_mode_specific_endpoint(
  271. fresh_bundle: tuple[Path, Manifest],
  272. source_file: Path,
  273. monkeypatch: pytest.MonkeyPatch,
  274. ) -> None:
  275. raw, _ = fresh_bundle
  276. payload = json.loads((raw / "_manifest.json").read_text())
  277. payload["api_mode"] = "local"
  278. payload["endpoint_signature"] = "http://old.example"
  279. (raw / "_manifest.json").write_text(json.dumps(payload))
  280. monkeypatch.setenv("MINERU_API_MODE", "local")
  281. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://new.example")
  282. assert is_bundle_valid(raw, source_file) is False
  283. @pytest.mark.offline
  284. def test_endpoint_signature_ignores_trailing_slash(
  285. fresh_bundle: tuple[Path, Manifest],
  286. source_file: Path,
  287. monkeypatch: pytest.MonkeyPatch,
  288. ) -> None:
  289. raw, _ = fresh_bundle
  290. payload = json.loads((raw / "_manifest.json").read_text())
  291. payload["api_mode"] = "local"
  292. payload["endpoint_signature"] = "http://old.example"
  293. (raw / "_manifest.json").write_text(json.dumps(payload))
  294. monkeypatch.setenv("MINERU_API_MODE", "local")
  295. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://old.example/")
  296. assert is_bundle_valid(raw, source_file) is True
  297. @pytest.mark.offline
  298. def test_invalid_when_critical_file_missing(
  299. fresh_bundle: tuple[Path, Manifest], source_file: Path
  300. ) -> None:
  301. raw, _ = fresh_bundle
  302. (raw / "content_list.json").unlink()
  303. assert is_bundle_valid(raw, source_file) is False
  304. @pytest.mark.offline
  305. def test_invalid_when_critical_file_size_changes(
  306. fresh_bundle: tuple[Path, Manifest], source_file: Path
  307. ) -> None:
  308. raw, _ = fresh_bundle
  309. cl = raw / "content_list.json"
  310. cl.write_text(cl.read_text() + "/* extra */")
  311. assert is_bundle_valid(raw, source_file) is False
  312. @pytest.mark.offline
  313. def test_invalid_when_critical_file_hash_changes(
  314. fresh_bundle: tuple[Path, Manifest], source_file: Path
  315. ) -> None:
  316. """Same size, different bytes. sha256 is the terminal check."""
  317. raw, _ = fresh_bundle
  318. cl = raw / "content_list.json"
  319. data = cl.read_text()
  320. mutated = data[:-1] + "X" # swap last char; size preserved
  321. assert len(mutated) == len(data)
  322. cl.write_text(mutated)
  323. assert is_bundle_valid(raw, source_file) is False
  324. @pytest.mark.offline
  325. def test_invalid_when_aux_file_size_changes(
  326. fresh_bundle: tuple[Path, Manifest], source_file: Path
  327. ) -> None:
  328. raw, _ = fresh_bundle
  329. p = raw / "images" / "img1.png"
  330. p.write_bytes(p.read_bytes() + b"corruption")
  331. assert is_bundle_valid(raw, source_file) is False
  332. @pytest.mark.offline
  333. def test_invalid_when_aux_file_missing(
  334. fresh_bundle: tuple[Path, Manifest], source_file: Path
  335. ) -> None:
  336. raw, _ = fresh_bundle
  337. (raw / "images" / "img2.png").unlink()
  338. assert is_bundle_valid(raw, source_file) is False
  339. # ---------------------------------------------------------------------------
  340. # Helpers
  341. # ---------------------------------------------------------------------------
  342. @pytest.mark.offline
  343. def test_clear_dir_contents_preserves_directory(tmp_path: Path) -> None:
  344. d = tmp_path / "raw"
  345. d.mkdir()
  346. (d / "a.txt").write_text("a")
  347. (d / "sub").mkdir()
  348. (d / "sub" / "b.txt").write_text("b")
  349. clear_dir_contents(d)
  350. assert d.exists()
  351. assert list(d.iterdir()) == []
  352. @pytest.mark.offline
  353. def test_compute_size_and_hash_consistency(tmp_path: Path) -> None:
  354. """Both values describe the same byte stream."""
  355. p = tmp_path / "f.bin"
  356. payload = b"abc" * 1000
  357. p.write_bytes(payload)
  358. size, h = compute_size_and_hash(p)
  359. assert size == len(payload)
  360. assert h.startswith("sha256:") and len(h) == len("sha256:") + 64
  361. @pytest.mark.offline
  362. def test_manifest_round_trip_via_disk(tmp_path: Path) -> None:
  363. """Write → read recovers all fields."""
  364. raw = tmp_path / "rt.mineru_raw"
  365. raw.mkdir()
  366. m = Manifest(
  367. source_content_hash="sha256:abc",
  368. source_size_bytes=10,
  369. source_filename_at_parse="x.pdf",
  370. critical_file=ManifestFile(
  371. path="content_list.json", size=5, sha256="sha256:cl"
  372. ),
  373. files=[ManifestFile(path="images/i.png", size=3)],
  374. total_size_bytes=8,
  375. task_id="t1",
  376. engine_version="v",
  377. endpoint_signature="ep",
  378. options_signature="sha256:opts",
  379. )
  380. write_manifest(raw, m)
  381. from lightrag.parser.external.mineru.manifest import load_manifest
  382. loaded = load_manifest(raw)
  383. assert loaded is not None
  384. assert loaded.source_content_hash == "sha256:abc"
  385. assert loaded.critical_file.sha256 == "sha256:cl"
  386. assert [f.path for f in loaded.files] == ["images/i.png"]
  387. assert loaded.task_id == "t1"
  388. assert loaded.options_signature == "sha256:opts"