# Source file: test_parse_native_lightrag_e2e.py (15 KB)
"""End-to-end test: native docx → LightRAG Document → stable cache key.

The original bug this guards against: ``parse_native`` used to write a
runtime-stamped structured parser payload into ``full_docs.content``, so
re-parsing the same docx produced different chunk-0 content and therefore
different LLM cache keys.

After the fix, ``parse_native`` writes ``.blocks.jsonl`` + sidecars and
``full_docs`` is in LIGHTRAG format. ``_load_lightrag_document_content``
skips the ``meta`` line (which contains ``parse_time``) and concatenates
only ``"type": "content"`` rows, so re-parsing must yield byte-identical
``merged_text`` and stable downstream chunk-0 content.
"""
  12. import asyncio
  13. import json
  14. from pathlib import Path
  15. import pytest
  16. from lightrag import LightRAG
  17. from lightrag.constants import (
  18. FULL_DOCS_FORMAT_PENDING_PARSE,
  19. PARSED_DIR_NAME,
  20. )
  21. from lightrag.utils import Tokenizer, TokenizerInterface, compute_args_hash
  22. def _block(content, *, heading="", level=0, parent=None, uuid="p1"):
  23. """Build a synthetic block dict matching extract_docx_blocks output."""
  24. return {
  25. "uuid": uuid,
  26. "uuid_end": uuid,
  27. "heading": heading,
  28. "content": content,
  29. "type": "text",
  30. "parent_headings": list(parent or []),
  31. "level": level,
  32. "table_chunk_role": "none",
  33. }
  34. class _MiniFullDocs:
  35. def __init__(self):
  36. self.data = {}
  37. async def upsert(self, payload):
  38. self.data.update(payload)
  39. async def get_by_id(self, doc_id):
  40. return self.data.get(doc_id)
  41. async def index_done_callback(self):
  42. return None
  43. class _MiniDocStatus:
  44. async def get_by_id(self, doc_id):
  45. return None
  46. async def upsert(self, data):
  47. return None
  48. class _CharTokenizer(TokenizerInterface):
  49. def encode(self, content: str):
  50. return [ord(ch) for ch in content]
  51. def decode(self, tokens):
  52. return "".join(chr(t) for t in tokens)
  53. class _MiniRag:
  54. """Just enough surface for parse_native + parser/docx adapter."""
  55. _persist_parsed_full_docs = LightRAG._persist_parsed_full_docs
  56. def __init__(self, working_dir):
  57. self.working_dir = str(working_dir)
  58. self.full_docs = _MiniFullDocs()
  59. self.doc_status = _MiniDocStatus()
  60. self.tokenizer = Tokenizer(model_name="char", tokenizer=_CharTokenizer())
  61. def _resolve_source_file_for_parser(self, file_path):
  62. return file_path
  63. @pytest.mark.offline
  64. def test_native_lightrag_path_produces_stable_merged_text(tmp_path, monkeypatch):
  65. """Re-parsing the same docx must yield byte-identical merged_text and
  66. therefore identical chunk_args_hash on chunk-0."""
  67. async def _run():
  68. input_dir = tmp_path / "input"
  69. input_dir.mkdir()
  70. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  71. source_path = input_dir / "stable.docx"
  72. source_path.write_bytes(b"fake docx bytes")
  73. # Stub extract_docx_blocks at the adapter so the upstream DOCX
  74. # parser is never invoked. The adapter still does all the
  75. # LightRAG-specific writing — that is what we want under test.
  76. stable_blocks = [
  77. _block(
  78. "Title\nFirst paragraph body.\nSecond paragraph body.",
  79. heading="Title",
  80. level=1,
  81. ),
  82. ]
  83. def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
  84. return [dict(b) for b in stable_blocks]
  85. monkeypatch.setattr(
  86. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  87. _stub_extract,
  88. )
  89. rag = _MiniRag(tmp_path / "work")
  90. # ---- First parse ----
  91. # parse_native archives the source after writing, so re-create it
  92. # before the second parse for a fair comparison.
  93. result1 = await LightRAG.parse_native(
  94. rag,
  95. "doc-stable",
  96. str(source_path),
  97. {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
  98. )
  99. merged1 = result1["content"]
  100. assert merged1, "first parse produced empty merged_text"
  101. # ---- Second parse ----
  102. # Restore the source file (archive moved it), reset the in-memory
  103. # full_docs row, and remove the parsed_dir so the writer rewrites
  104. # both meta (with a fresh parse_time) and content lines.
  105. source_path.write_bytes(b"fake docx bytes")
  106. rag.full_docs.data.clear()
  107. parsed_artifact_dir = input_dir / PARSED_DIR_NAME / f"{source_path.name}.parsed"
  108. if parsed_artifact_dir.exists():
  109. import shutil
  110. shutil.rmtree(parsed_artifact_dir)
  111. result2 = await LightRAG.parse_native(
  112. rag,
  113. "doc-stable",
  114. str(source_path),
  115. {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
  116. )
  117. merged2 = result2["content"]
  118. # Core invariant: merged_text byte-identical across runs even
  119. # though parse_time in the .blocks.jsonl meta line differs.
  120. assert merged1 == merged2
  121. # And: a hash computed over a chunk-0 derived from merged_text
  122. # must also be identical — that is what powers LLM cache hits.
  123. prompt_template = "EXTRACT_PROMPT::{text}"
  124. chunk0_a = prompt_template.format(text=merged1[:200])
  125. chunk0_b = prompt_template.format(text=merged2[:200])
  126. assert chunk0_a == chunk0_b
  127. assert compute_args_hash(chunk0_a) == compute_args_hash(chunk0_b)
  128. # And: full_docs.content uses the {{LRdoc}} marker plus a leading
  129. # summary derived from merged_text (not the legacy placeholder).
  130. record = rag.full_docs.data["doc-stable"]
  131. assert record["parse_format"] == "lightrag"
  132. assert record["content"].startswith("{{LRdoc}}")
  133. assert merged1[:40] in record["content"]
  134. asyncio.run(_run())
  135. @pytest.mark.offline
  136. def test_native_lightrag_path_writes_blocks_jsonl_and_skips_meta_on_load(
  137. tmp_path, monkeypatch
  138. ):
  139. """Sanity check: ``_load_lightrag_document_content`` must skip the
  140. meta line (where the runtime ``parse_time`` lives) and only return
  141. body content. This is what lets re-parsing produce stable text."""
  142. async def _run():
  143. input_dir = tmp_path / "input"
  144. input_dir.mkdir()
  145. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  146. source_path = input_dir / "skipmeta.docx"
  147. source_path.write_bytes(b"fake")
  148. def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
  149. return [_block("the body")]
  150. monkeypatch.setattr(
  151. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  152. _stub_extract,
  153. )
  154. rag = _MiniRag(tmp_path / "work")
  155. result = await LightRAG.parse_native(
  156. rag,
  157. "doc-skip",
  158. str(source_path),
  159. {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
  160. )
  161. # The .blocks.jsonl on disk DOES contain "parse_time" inside the
  162. # meta line; the merged_text returned by parse_native MUST NOT.
  163. blocks_path = result["blocks_path"]
  164. on_disk = open(blocks_path, "r", encoding="utf-8").read()
  165. assert "parse_time" in on_disk
  166. assert "parse_time" not in result["content"]
  167. assert result["content"].strip() == "the body"
  168. asyncio.run(_run())
  169. @pytest.mark.offline
  170. def test_native_lightrag_path_leaves_unknown_table_caption_empty(tmp_path, monkeypatch):
  171. """The native DOCX parser does not infer table titles, so its table
  172. sidecar must not synthesize captions like ``表1``.
  173. """
  174. async def _run():
  175. input_dir = tmp_path / "input"
  176. input_dir.mkdir()
  177. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  178. source_path = input_dir / "table.docx"
  179. source_path.write_bytes(b"fake")
  180. def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
  181. return [_block('before\n<table>[["A"]]</table>\nafter')]
  182. monkeypatch.setattr(
  183. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  184. _stub_extract,
  185. )
  186. rag = _MiniRag(tmp_path / "work")
  187. result = await LightRAG.parse_native(
  188. rag,
  189. "doc-table",
  190. str(source_path),
  191. {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
  192. )
  193. blocks_path = Path(result["blocks_path"])
  194. lines = blocks_path.read_text(encoding="utf-8").splitlines()
  195. block = json.loads(lines[1])
  196. assert "caption=" not in block["content"]
  197. assert "表1" not in block["content"]
  198. tables_path = blocks_path.with_suffix("").with_suffix(".tables.json")
  199. tables = json.loads(tables_path.read_text(encoding="utf-8"))
  200. table_entry = tables["tables"]["tb-table-0001"]
  201. assert table_entry["caption"] == ""
  202. # Surrounding is now backfilled at analyze_multimodal entry, not in
  203. # parse_native — invoke the same routine directly to mirror that.
  204. from lightrag.multimodal_context import enrich_sidecars_with_surrounding
  205. enrich_sidecars_with_surrounding(
  206. blocks_path=str(blocks_path),
  207. enabled_modalities={"tables"},
  208. tokenizer=rag.tokenizer,
  209. )
  210. tables = json.loads(tables_path.read_text(encoding="utf-8"))
  211. table_entry = tables["tables"]["tb-table-0001"]
  212. assert table_entry["surrounding"] == {
  213. "leading": "before\n",
  214. "trailing": "\nafter",
  215. }
  216. asyncio.run(_run())
  217. @pytest.mark.offline
  218. def test_analyze_entrypoint_backfills_surrounding_for_all_sidecars(
  219. tmp_path, monkeypatch
  220. ):
  221. """Surrounding is backfilled at analyze_multimodal entry, covering native
  222. parse output as well as any other sidecar-producing engine."""
  223. async def _run():
  224. input_dir = tmp_path / "input"
  225. input_dir.mkdir()
  226. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  227. source_path = input_dir / "all_modalities.docx"
  228. source_path.write_bytes(b"fake")
  229. def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
  230. assert drawing_context is not None
  231. assert drawing_context.export_dir_path is not None
  232. (drawing_context.export_dir_path / "pic.png").write_bytes(b"PNG")
  233. return [
  234. _block(
  235. 'alpha <drawing id="1" format="png" '
  236. 'path="all_modalities.blocks.assets/pic.png" /> beta\n'
  237. '<table>[["A"]]</table> gamma\n'
  238. "<equation>E=mc^2</equation>\n"
  239. "delta"
  240. )
  241. ]
  242. monkeypatch.setattr(
  243. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  244. _stub_extract,
  245. )
  246. rag = _MiniRag(tmp_path / "work")
  247. result = await LightRAG.parse_native(
  248. rag,
  249. "doc-mm",
  250. str(source_path),
  251. {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
  252. )
  253. blocks_path = Path(result["blocks_path"])
  254. base = str(blocks_path)[: -len(".blocks.jsonl")]
  255. # Parse-time sidecars must NOT contain surrounding — that field is
  256. # now produced at analyze_multimodal entry.
  257. for root in ("drawings", "tables", "equations"):
  258. payload = json.loads(Path(base + f".{root}.json").read_text("utf-8"))
  259. for item in payload[root].values():
  260. assert "surrounding" not in item
  261. # Now invoke the same routine analyze_multimodal calls and verify
  262. # all modalities get populated.
  263. from lightrag.multimodal_context import enrich_sidecars_with_surrounding
  264. enrich_sidecars_with_surrounding(
  265. blocks_path=str(blocks_path),
  266. enabled_modalities={"drawings", "tables", "equations"},
  267. tokenizer=rag.tokenizer,
  268. )
  269. for root in ("drawings", "tables", "equations"):
  270. payload = json.loads(Path(base + f".{root}.json").read_text("utf-8"))
  271. items = payload[root]
  272. assert items
  273. for item in items.values():
  274. assert "surrounding" in item
  275. assert set(item["surrounding"]) == {"leading", "trailing"}
  276. asyncio.run(_run())
  277. @pytest.mark.offline
  278. def test_native_lightrag_path_writes_image_assets_to_blocks_assets_dir(
  279. tmp_path, monkeypatch
  280. ):
  281. """Native parsing must drop image bytes into ``<base>.blocks.assets/``
  282. after the adapter creates the parsed dir (which it wipes at the start),
  283. and the drawings sidecar must reference the rewritten ids.
  284. """
  285. from pathlib import Path
  286. async def _run():
  287. input_dir = tmp_path / "input"
  288. input_dir.mkdir()
  289. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  290. source_path = input_dir / "with_pics.docx"
  291. source_path.write_bytes(b"fake")
  292. def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
  293. # The adapter already created the asset dir before calling us;
  294. # write the fake image bytes there as a side-effect, then return
  295. # a block whose content references that asset via <drawing .../>.
  296. assert drawing_context is not None
  297. assert drawing_context.export_dir_path is not None
  298. (drawing_context.export_dir_path / "pic.png").write_bytes(b"PNG-BYTES")
  299. return [
  300. _block(
  301. "intro\n"
  302. '<drawing id="1" name="pic" format="png" '
  303. 'path="with_pics.blocks.assets/pic.png" />\n'
  304. "outro",
  305. heading="intro",
  306. level=1,
  307. ),
  308. ]
  309. monkeypatch.setattr(
  310. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  311. _stub_extract,
  312. )
  313. rag = _MiniRag(tmp_path / "work")
  314. result = await LightRAG.parse_native(
  315. rag,
  316. "doc-pic",
  317. str(source_path),
  318. {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
  319. )
  320. blocks_path = Path(result["blocks_path"])
  321. parsed_dir = blocks_path.parent
  322. asset_dir = parsed_dir / "with_pics.blocks.assets"
  323. # Asset dir must exist alongside .blocks.jsonl and survive the
  324. # adapter's parsed_dir cleanup step.
  325. assert asset_dir.is_dir(), (
  326. f"asset dir not created at {asset_dir}; parsed_dir contents: "
  327. f"{list(parsed_dir.iterdir())}"
  328. )
  329. assert (asset_dir / "pic.png").read_bytes() == b"PNG-BYTES"
  330. # And drawings.json sidecar should also be there since the block
  331. # contained a <drawing .../> markup the adapter had to record.
  332. assert (parsed_dir / "with_pics.drawings.json").is_file()
  333. asyncio.run(_run())