test_parse_mineru_sidecar.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. """Integration tests for ``parse_mineru`` with the unified sidecar pipeline.
  2. These tests stub :class:`MinerURawClient.download_into` so no real MinerU
  3. service is contacted; the focus is on:
  4. - happy path: cache miss → download → sidecar emitted with all expected
  5. files in the spec-compliant locations
  6. - cache hit: a pre-existing valid ``*.mineru_raw/`` + manifest causes
  7. ``MinerURawClient.download_into`` NOT to be called
  8. - ``LIGHTRAG_FORCE_REPARSE_MINERU=true`` forces a re-download even when
  9. the manifest is valid
  10. """
  11. from __future__ import annotations
  12. import asyncio
  13. import json
  14. from pathlib import Path
  15. from typing import Any
  16. import numpy as np
  17. import pytest
  18. from lightrag import LightRAG
  19. from lightrag.constants import (
  20. FULL_DOCS_FORMAT_LIGHTRAG,
  21. )
  22. from lightrag.parser.external.mineru import compute_size_and_hash
  23. from lightrag.parser.external.mineru.cache import current_mineru_options_signature
  24. from lightrag.parser.external.mineru.manifest import (
  25. Manifest,
  26. ManifestFile,
  27. write_manifest,
  28. )
  29. from lightrag.utils import EmbeddingFunc, Tokenizer
  30. class _SimpleTokenizerImpl:
  31. def encode(self, content: str) -> list[int]:
  32. return [ord(ch) for ch in content]
  33. def decode(self, tokens: list[int]) -> str:
  34. return "".join(chr(t) for t in tokens)
  35. async def _mock_embedding(texts: list[str]) -> np.ndarray:
  36. return np.random.rand(len(texts), 32)
  37. async def _mock_llm(prompt: Any, **kwargs: Any) -> str:
  38. return '{"name":"x","summary":"s","detail_description":"d"}'
  39. def _new_rag(tmp_path: Path) -> LightRAG:
  40. return LightRAG(
  41. working_dir=str(tmp_path),
  42. workspace=f"test-mineru-sidecar-{tmp_path.name}",
  43. llm_model_func=_mock_llm,
  44. embedding_func=EmbeddingFunc(
  45. embedding_dim=32,
  46. max_token_size=4096,
  47. func=_mock_embedding,
  48. ),
  49. tokenizer=Tokenizer("mock-tokenizer", _SimpleTokenizerImpl()),
  50. vlm_process_enable=False,
  51. )
# Synthetic ``content_list.json`` payload written by the fake downloader.
# Item order matters: tests in this file assert ``self_ref`` pointers by
# list index (table -> #/2, image -> #/3, equation -> #/4).
_FAKE_CONTENT_LIST = [
    # Level-1 heading; the sidecar test expects this text as ``doc_title``.
    {"type": "text", "text": "1 Introduction", "text_level": 1},
    {"type": "text", "text": "Body paragraph."},
    # Index 2: a 2x2 table with caption, page index and bounding box.
    {
        "type": "table",
        "table_body": [["A", "B"], ["1", "2"]],
        "num_rows": 2,
        "num_cols": 2,
        "table_caption": ["Tbl"],
        "page_idx": 0,
        "bbox": [10, 10, 100, 50],
    },
    # Index 3: an image whose ``img_path`` matches the file the fake
    # downloader creates under ``images/``.
    {
        "type": "image",
        "img_path": "images/img_001.jpg",
        "image_caption": ["Fig 1"],
        "page_idx": 1,
        "bbox": [20, 20, 200, 100],
    },
    # Index 4: an equation with a caption.
    {"type": "equation", "text": "$E = mc^2$", "caption": "Eq 1", "page_idx": 1},
]
  73. def _install_fake_download(monkeypatch: pytest.MonkeyPatch) -> dict[str, int]:
  74. """Replace :meth:`MinerURawClient.download_into` with a recorder that
  75. writes a synthetic bundle (content_list.json + one image + manifest).
  76. """
  77. import lightrag.parser.external.mineru.client as client_mod
  78. counters = {"calls": 0, "upload_names": []}
  79. async def _fake_download(
  80. self,
  81. raw_dir: Path,
  82. source_file_path: Path,
  83. *,
  84. upload_name: str | None = None,
  85. ):
  86. counters["calls"] += 1
  87. counters["upload_names"].append(upload_name)
  88. raw_dir.mkdir(parents=True, exist_ok=True)
  89. (raw_dir / "content_list.json").write_text(
  90. json.dumps(_FAKE_CONTENT_LIST, ensure_ascii=False),
  91. encoding="utf-8",
  92. )
  93. (raw_dir / "images").mkdir(exist_ok=True)
  94. (raw_dir / "images" / "img_001.jpg").write_bytes(b"\xff\xd8\xff\xe0fakeJPEG")
  95. src_size, src_hash = compute_size_and_hash(source_file_path)
  96. crit_size, crit_hash = compute_size_and_hash(raw_dir / "content_list.json")
  97. files = [
  98. ManifestFile(
  99. path="images/img_001.jpg",
  100. size=(raw_dir / "images" / "img_001.jpg").stat().st_size,
  101. )
  102. ]
  103. manifest = Manifest(
  104. source_content_hash=src_hash,
  105. source_size_bytes=src_size,
  106. source_filename_at_parse=upload_name or source_file_path.name,
  107. critical_file=ManifestFile(
  108. path="content_list.json", size=crit_size, sha256=crit_hash
  109. ),
  110. files=files,
  111. total_size_bytes=crit_size + sum(f.size for f in files),
  112. task_id=f"fake-{counters['calls']}",
  113. api_mode="local",
  114. options_signature=current_mineru_options_signature(),
  115. )
  116. write_manifest(raw_dir, manifest)
  117. return manifest
  118. monkeypatch.setattr(client_mod.MinerURawClient, "download_into", _fake_download)
  119. return counters
  120. @pytest.mark.offline
  121. def test_parse_mineru_emits_compliant_sidecar(
  122. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  123. ) -> None:
  124. """End-to-end: parse_mineru produces *.parsed/ with spec-compliant
  125. blocks.jsonl + per-modality JSONs + assets dir; *.mineru_raw/ kept."""
  126. async def _run() -> None:
  127. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://mineru.example")
  128. counters = _install_fake_download(monkeypatch)
  129. # Don't move the source out from under the cache validator between
  130. # repeated parse_mineru calls.
  131. async def _noop_archive(_p: str) -> None:
  132. return None
  133. import lightrag.pipeline as pipeline_module
  134. monkeypatch.setattr(
  135. pipeline_module,
  136. "archive_docx_source_after_full_docs_sync",
  137. _noop_archive,
  138. )
  139. input_dir = tmp_path / "inputs" / "ws"
  140. input_dir.mkdir(parents=True)
  141. src = input_dir / "demo.pdf"
  142. src.write_bytes(b"PDFPDF" * 256)
  143. rag = _new_rag(tmp_path)
  144. await rag.initialize_storages()
  145. try:
  146. doc_id = "doc-abcdef0123456789abcdef0123456789"
  147. await rag.doc_status.upsert(
  148. {
  149. doc_id: {
  150. "status": "PARSING",
  151. "content_summary": "",
  152. "content_length": 0,
  153. "chunks_count": 0,
  154. "chunks_list": [],
  155. "created_at": "2026-05-15T00:00:00+00:00",
  156. "updated_at": "2026-05-15T00:00:00+00:00",
  157. "file_path": "demo.pdf",
  158. "track_id": "trk",
  159. "content_hash": "",
  160. "metadata": {},
  161. }
  162. }
  163. )
  164. monkeypatch.setattr(
  165. rag,
  166. "_resolve_source_file_for_parser",
  167. lambda _p: str(src),
  168. )
  169. parsed = await rag.parse_mineru(
  170. doc_id=doc_id,
  171. file_path="demo.pdf",
  172. content_data={},
  173. )
  174. assert counters["calls"] == 1, "download_into should run once on miss"
  175. parsed_dir = Path(parsed["blocks_path"]).parent
  176. assert parsed["parse_format"] == FULL_DOCS_FORMAT_LIGHTRAG
  177. assert parsed_dir.name == "demo.pdf.parsed"
  178. # Sidecar files present
  179. files = {p.name for p in parsed_dir.iterdir() if p.is_file()}
  180. assert "demo.blocks.jsonl" in files
  181. assert "demo.tables.json" in files
  182. assert "demo.drawings.json" in files
  183. assert "demo.equations.json" in files
  184. assert (parsed_dir / "demo.blocks.assets").is_dir()
  185. assert (parsed_dir / "demo.blocks.assets" / "img_001.jpg").is_file()
  186. # Content of blocks.jsonl
  187. blocks_raw = (parsed_dir / "demo.blocks.jsonl").read_text()
  188. lines = blocks_raw.splitlines()
  189. meta = json.loads(lines[0])
  190. rows = [json.loads(line) for line in lines[1:]]
  191. assert meta["parse_engine"] == "mineru"
  192. assert meta["table_file"] is True
  193. assert meta["drawing_file"] is True
  194. assert meta["equation_file"] is True
  195. assert meta["asset_dir"] is True
  196. assert meta["doc_title"] == "1 Introduction"
  197. # bbox_attributes present for mineru (PDF coordinate context)
  198. assert meta["bbox_attributes"] == {"origin": "LEFTTOP", "max": 1000}
  199. # Spec fix: <table> placeholder inline, not <cite>
  200. contents = " ".join(row.get("content", "") for row in rows)
  201. assert '<table id="tb-' in contents
  202. assert 'format="json"' in contents
  203. assert "<cite" not in contents
  204. # bbox positions present on at least one block
  205. assert any(
  206. p.get("type") == "bbox"
  207. for row in rows
  208. for p in row.get("positions") or []
  209. )
  210. # Drawing path points inside *.blocks.assets/
  211. drawings = json.loads((parsed_dir / "demo.drawings.json").read_text())[
  212. "drawings"
  213. ]
  214. (drawing_id, drawing_item) = next(iter(drawings.items()))
  215. assert drawing_id.startswith("im-")
  216. assert drawing_item["path"] == "demo.blocks.assets/img_001.jpg"
  217. assert drawing_item["self_ref"] == "content_list.json#/3"
  218. # Raw bundle preserved next to sidecar
  219. raw_dir = parsed_dir.parent / "demo.pdf.mineru_raw"
  220. assert (raw_dir / "_manifest.json").is_file()
  221. assert (raw_dir / "content_list.json").is_file()
  222. assert (raw_dir / "images" / "img_001.jpg").is_file()
  223. # No legacy non-spec image field on tables
  224. tables = json.loads((parsed_dir / "demo.tables.json").read_text())["tables"]
  225. (_, table_item) = next(iter(tables.items()))
  226. assert "image" not in table_item
  227. assert table_item["self_ref"] == "content_list.json#/2"
  228. equations = json.loads((parsed_dir / "demo.equations.json").read_text())[
  229. "equations"
  230. ]
  231. (_, equation_item) = next(iter(equations.items()))
  232. assert equation_item["self_ref"] == "content_list.json#/4"
  233. finally:
  234. await rag.finalize_storages()
  235. asyncio.new_event_loop().run_until_complete(_run())
  236. @pytest.mark.offline
  237. def test_parse_mineru_cache_hit_skips_download(
  238. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  239. ) -> None:
  240. """A pre-existing valid bundle short-circuits the network call entirely."""
  241. async def _run() -> None:
  242. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://mineru.example")
  243. counters = _install_fake_download(monkeypatch)
  244. # Don't move the source out from under the cache validator between
  245. # repeated parse_mineru calls.
  246. async def _noop_archive(_p: str) -> None:
  247. return None
  248. import lightrag.pipeline as pipeline_module
  249. monkeypatch.setattr(
  250. pipeline_module,
  251. "archive_docx_source_after_full_docs_sync",
  252. _noop_archive,
  253. )
  254. input_dir = tmp_path / "inputs" / "ws"
  255. input_dir.mkdir(parents=True)
  256. src = input_dir / "demo.pdf"
  257. src.write_bytes(b"PDFPDF" * 256)
  258. rag = _new_rag(tmp_path)
  259. await rag.initialize_storages()
  260. try:
  261. doc_id = "doc-abcdef0123456789abcdef0123456789"
  262. await rag.doc_status.upsert(
  263. {
  264. doc_id: {
  265. "status": "PARSING",
  266. "content_summary": "",
  267. "content_length": 0,
  268. "chunks_count": 0,
  269. "chunks_list": [],
  270. "created_at": "2026-05-15T00:00:00+00:00",
  271. "updated_at": "2026-05-15T00:00:00+00:00",
  272. "file_path": "demo.pdf",
  273. "track_id": "trk",
  274. "content_hash": "",
  275. "metadata": {},
  276. }
  277. }
  278. )
  279. monkeypatch.setattr(
  280. rag,
  281. "_resolve_source_file_for_parser",
  282. lambda _p: str(src),
  283. )
  284. # First call: cache miss → download once.
  285. await rag.parse_mineru(
  286. doc_id=doc_id,
  287. file_path="demo.pdf",
  288. content_data={},
  289. )
  290. assert counters["calls"] == 1
  291. # Second call: should hit cache.
  292. await rag.parse_mineru(
  293. doc_id=doc_id,
  294. file_path="demo.pdf",
  295. content_data={},
  296. )
  297. assert counters["calls"] == 1, "cache hit must not re-download"
  298. # Third call with force-reparse: cache invalidated.
  299. monkeypatch.setenv("LIGHTRAG_FORCE_REPARSE_MINERU", "true")
  300. await rag.parse_mineru(
  301. doc_id=doc_id,
  302. file_path="demo.pdf",
  303. content_data={},
  304. )
  305. assert counters["calls"] == 2
  306. finally:
  307. await rag.finalize_storages()
  308. asyncio.new_event_loop().run_until_complete(_run())
  309. @pytest.mark.offline
  310. def test_parse_mineru_upload_name_strips_parser_hint(
  311. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  312. ) -> None:
  313. """MinerU upload name should use the canonical filename, not parser
  314. hints embedded in the source basename."""
  315. async def _run() -> None:
  316. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://mineru.example")
  317. counters = _install_fake_download(monkeypatch)
  318. input_dir = tmp_path / "inputs" / "ws"
  319. input_dir.mkdir(parents=True)
  320. src = input_dir / "demo.[mineru-iet].pdf"
  321. src.write_bytes(b"PDFPDF" * 256)
  322. rag = _new_rag(tmp_path)
  323. await rag.initialize_storages()
  324. try:
  325. doc_id = "doc-abcdef0123456789abcdef0123456789"
  326. await rag.doc_status.upsert(
  327. {
  328. doc_id: {
  329. "status": "PARSING",
  330. "content_summary": "",
  331. "content_length": 0,
  332. "chunks_count": 0,
  333. "chunks_list": [],
  334. "created_at": "2026-05-15T00:00:00+00:00",
  335. "updated_at": "2026-05-15T00:00:00+00:00",
  336. "file_path": src.name,
  337. "track_id": "trk",
  338. "content_hash": "",
  339. "metadata": {},
  340. }
  341. }
  342. )
  343. monkeypatch.setattr(
  344. rag,
  345. "_resolve_source_file_for_parser",
  346. lambda _p: str(src),
  347. )
  348. parsed = await rag.parse_mineru(
  349. doc_id=doc_id,
  350. file_path=src.name,
  351. content_data={},
  352. )
  353. assert counters["upload_names"] == ["demo.pdf"]
  354. parsed_dir = Path(parsed["blocks_path"]).parent
  355. assert parsed_dir.name == "demo.pdf.parsed"
  356. manifest = json.loads(
  357. (
  358. parsed_dir.parent / "demo.pdf.mineru_raw" / "_manifest.json"
  359. ).read_text(encoding="utf-8")
  360. )
  361. assert manifest["source_filename_at_parse"] == "demo.pdf"
  362. finally:
  363. await rag.finalize_storages()
  364. asyncio.new_event_loop().run_until_complete(_run())
  365. @pytest.mark.offline
  366. def test_parse_mineru_cache_invalidates_on_source_change(
  367. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  368. ) -> None:
  369. """Source file content swapped (same/different size) → cache miss."""
  370. async def _run() -> None:
  371. monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://mineru.example")
  372. counters = _install_fake_download(monkeypatch)
  373. # Don't move the source out from under the cache validator between
  374. # repeated parse_mineru calls.
  375. async def _noop_archive(_p: str) -> None:
  376. return None
  377. import lightrag.pipeline as pipeline_module
  378. monkeypatch.setattr(
  379. pipeline_module,
  380. "archive_docx_source_after_full_docs_sync",
  381. _noop_archive,
  382. )
  383. input_dir = tmp_path / "inputs" / "ws"
  384. input_dir.mkdir(parents=True)
  385. src = input_dir / "demo.pdf"
  386. src.write_bytes(b"PDFPDF" * 256)
  387. rag = _new_rag(tmp_path)
  388. await rag.initialize_storages()
  389. try:
  390. doc_id = "doc-abcdef0123456789abcdef0123456789"
  391. await rag.doc_status.upsert(
  392. {
  393. doc_id: {
  394. "status": "PARSING",
  395. "content_summary": "",
  396. "content_length": 0,
  397. "chunks_count": 0,
  398. "chunks_list": [],
  399. "created_at": "2026-05-15T00:00:00+00:00",
  400. "updated_at": "2026-05-15T00:00:00+00:00",
  401. "file_path": "demo.pdf",
  402. "track_id": "trk",
  403. "content_hash": "",
  404. "metadata": {},
  405. }
  406. }
  407. )
  408. monkeypatch.setattr(
  409. rag,
  410. "_resolve_source_file_for_parser",
  411. lambda _p: str(src),
  412. )
  413. await rag.parse_mineru(
  414. doc_id=doc_id,
  415. file_path="demo.pdf",
  416. content_data={},
  417. )
  418. assert counters["calls"] == 1
  419. # Same length, different bytes → fast-path passes, hash fails.
  420. data = src.read_bytes()
  421. src.write_bytes(b"\x00" + data[1:])
  422. await rag.parse_mineru(
  423. doc_id=doc_id,
  424. file_path="demo.pdf",
  425. content_data={},
  426. )
  427. assert counters["calls"] == 2
  428. finally:
  429. await rag.finalize_storages()
  430. asyncio.new_event_loop().run_until_complete(_run())