test_chunking_raw_lightrag_parity.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
  1. """F-chunking parity between raw and lightrag formats.
  2. After the F-chunking unification, ``apipeline_process_enqueue_documents``
  3. strips the ``{{LRdoc}}`` marker from lightrag-format content and feeds the
  4. result into the same ``chunking_func`` used by raw documents. These tests
  5. guard the contract end-to-end:
  6. * T1: identical input text produces identical chunking inputs whether it
  7. arrives as raw or as a lightrag ``.blocks.jsonl``.
  8. * T2: ``full_docs.content`` for lightrag carries the *full* merged text
  9. with the ``{{LRdoc}}`` marker, while ``doc_status`` reports the bare
  10. body length / summary (no marker leakage).
  11. * T3: ``extraction_meta["parse_format"]`` (surfaced via
  12. ``doc_status.metadata``) is now ``"lightrag"`` for lightrag docs —
  13. previously a structured-parse fallback always tagged ``raw`` and
  14. silently mislabelled the persisted record.
  15. * T4: a raw document whose body coincidentally *looks* like structured
  16. JSONL is still tokenised as plain text — guards against re-introducing
  17. dropped structured-format detection in the raw path.
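* T5: an explicit ``process_options`` selector (``R`` or ``V``) dispatches
  to its dedicated chunker (``chunking_by_recursive_character`` /
  ``chunking_by_semantic_vector``) and bypasses the legacy
  ``chunking_func``; the old "R/V strategies are not yet implemented"
  warning is no longer logged.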
  20. * T6: a ``pending_parse`` document that resolves to lightrag at parse
  21. time ends up with a real ``content_summary`` after PROCESSED — the
  22. ANALYZING transition refreshes the summary from the parsed body so
  23. pending-parse rows no longer carry the empty enqueue-time placeholder
  24. through to the user-facing list APIs.
  25. * T7: a raw document whose body *literally* starts with ``{{LRdoc}}``
  26. is chunked verbatim — guards against accidental re-introduction of an
  27. unconditional ``strip_lightrag_doc_prefix`` at the chunking boundary
  28. (which would silently drop the user's first 9 characters).
  29. """
  30. import asyncio
  31. import json
  32. import logging
  33. from pathlib import Path
  34. import numpy as np
  35. import pytest
  36. from lightrag import LightRAG, ROLES, RoleLLMConfig
  37. from lightrag.constants import (
  38. FULL_DOCS_FORMAT_LIGHTRAG,
  39. FULL_DOCS_FORMAT_PENDING_PARSE,
  40. LIGHTRAG_DOC_CONTENT_PREFIX,
  41. )
  42. from lightrag.utils import (
  43. EmbeddingFunc,
  44. Tokenizer,
  45. compute_mdhash_id,
  46. get_content_summary,
  47. )
  48. # ---------------------------------------------------------------------------
  49. # Shared fixtures (mirrors the harness used by test_pipeline_release_closure)
  50. # ---------------------------------------------------------------------------
  51. class _SimpleTokenizerImpl:
  52. """Char-level tokenizer so 1 char ≈ 1 token; keeps assertions readable."""
  53. def encode(self, content: str) -> list[int]:
  54. return [ord(ch) for ch in content]
  55. def decode(self, tokens: list[int]) -> str:
  56. return "".join(chr(t) for t in tokens)
  57. async def _mock_embedding(texts: list[str]) -> np.ndarray:
  58. return np.random.rand(len(texts), 32)
  59. async def _mock_llm(prompt, **kwargs):
  60. return '{"name":"x","summary":"s","detail_description":"d"}'
  61. _ROLE_FIELD_SUFFIXES = (
  62. ("_llm_model_func", "func"),
  63. ("_llm_model_kwargs", "kwargs"),
  64. ("_llm_model_max_async", "max_async"),
  65. ("_llm_timeout", "timeout"),
  66. )
  67. def _new_rag(tmp_path: Path, **kwargs) -> LightRAG:
  68. role_configs: dict[str, RoleLLMConfig] = {}
  69. for spec in ROLES:
  70. bucket = {}
  71. for suffix, target in _ROLE_FIELD_SUFFIXES:
  72. key = f"{spec.name}{suffix}"
  73. if key in kwargs:
  74. bucket[target] = kwargs.pop(key)
  75. if bucket:
  76. role_configs[spec.name] = RoleLLMConfig(**bucket)
  77. if role_configs:
  78. kwargs["role_llm_configs"] = role_configs
  79. return LightRAG(
  80. working_dir=str(tmp_path),
  81. workspace=f"chunking-parity-{tmp_path.name}",
  82. llm_model_func=_mock_llm,
  83. embedding_func=EmbeddingFunc(
  84. embedding_dim=32,
  85. max_token_size=4096,
  86. func=_mock_embedding,
  87. ),
  88. tokenizer=Tokenizer("mock-tokenizer", _SimpleTokenizerImpl()),
  89. **kwargs,
  90. )
  91. def _attach_chunking_spy(rag: LightRAG) -> dict:
  92. """Replace ``rag.chunking_func`` with a recording wrapper.
  93. Returns a dict whose ``input`` key receives the second positional arg
  94. (the content string) at every chunking call. The original chunker
  95. runs normally so the pipeline reaches PROCESSED.
  96. """
  97. captured: dict = {"input": None, "calls": 0}
  98. real = rag.chunking_func
  99. def _spy(tokenizer, content, *args, **kwargs):
  100. captured["input"] = content
  101. captured["calls"] += 1
  102. return real(tokenizer, content, *args, **kwargs)
  103. rag.chunking_func = _spy
  104. return captured
  105. def _write_lightrag_blocks(blocks_path: Path, body_paragraphs: list[str]) -> None:
  106. """Write a minimal valid LightRAG ``.blocks.jsonl`` with body paragraphs."""
  107. lines = [
  108. json.dumps(
  109. {
  110. "type": "meta",
  111. "format": "lightrag",
  112. "version": "1.0",
  113. "format_version": "1.0",
  114. },
  115. ensure_ascii=False,
  116. )
  117. ]
  118. for i, para in enumerate(body_paragraphs):
  119. lines.append(
  120. json.dumps(
  121. {
  122. "type": "content",
  123. "blockid": f"b{i}",
  124. "format": "plain_text",
  125. "content": para,
  126. },
  127. ensure_ascii=False,
  128. )
  129. )
  130. blocks_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
  131. # ---------------------------------------------------------------------------
  132. # T1 — parity: raw vs lightrag produce identical chunking input
  133. # ---------------------------------------------------------------------------
  134. @pytest.mark.offline
  135. def test_chunking_input_parity_raw_vs_lightrag(tmp_path, monkeypatch):
  136. """Same body text in raw and lightrag formats must reach
  137. ``chunking_func`` with byte-identical input."""
  138. paragraphs = [
  139. "Alpha paragraph with enough words to make it look real.",
  140. "Beta paragraph extends the body so chunking has substance.",
  141. "Gamma paragraph closes the document with a few more sentences.",
  142. ]
  143. expected_merged = "\n\n".join(paragraphs)
  144. async def _run():
  145. # ---- RAW path ----
  146. rag_raw = _new_rag(tmp_path / "raw")
  147. await rag_raw.initialize_storages()
  148. spy_raw = _attach_chunking_spy(rag_raw)
  149. try:
  150. await rag_raw.apipeline_enqueue_documents(
  151. expected_merged,
  152. file_paths="parity_raw.txt",
  153. track_id="track-raw",
  154. )
  155. await rag_raw.apipeline_process_enqueue_documents()
  156. finally:
  157. await rag_raw.finalize_storages()
  158. # ---- LIGHTRAG path ----
  159. input_dir = tmp_path / "lr-input"
  160. parsed_dir = input_dir / "__parsed__"
  161. parsed_dir.mkdir(parents=True)
  162. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  163. blocks_path = parsed_dir / "parity.blocks.jsonl"
  164. _write_lightrag_blocks(blocks_path, paragraphs)
  165. rag_lr = _new_rag(tmp_path / "lr")
  166. await rag_lr.initialize_storages()
  167. spy_lr = _attach_chunking_spy(rag_lr)
  168. try:
  169. await rag_lr.apipeline_enqueue_documents(
  170. "",
  171. file_paths="parity.lightrag",
  172. docs_format=FULL_DOCS_FORMAT_LIGHTRAG,
  173. lightrag_document_paths="__parsed__/parity.blocks.jsonl",
  174. track_id="track-lr",
  175. )
  176. await rag_lr.apipeline_process_enqueue_documents()
  177. finally:
  178. await rag_lr.finalize_storages()
  179. assert spy_raw["calls"] >= 1, "raw doc never reached chunking_func"
  180. assert spy_lr["calls"] >= 1, "lightrag doc never reached chunking_func"
  181. assert spy_lr["input"] == spy_raw["input"] == expected_merged, (
  182. "chunking_func received different inputs for raw vs lightrag; "
  183. f"raw={spy_raw['input']!r}\nlr={spy_lr['input']!r}"
  184. )
  185. assert not spy_lr["input"].startswith(
  186. LIGHTRAG_DOC_CONTENT_PREFIX
  187. ), "{{LRdoc}} marker leaked into chunking_func input"
  188. asyncio.run(_run())
  189. # ---------------------------------------------------------------------------
  190. # T2 — full_docs.content carries full text; doc_status reports bare body
  191. # ---------------------------------------------------------------------------
  192. @pytest.mark.offline
  193. def test_full_docs_content_carries_full_merged_text(tmp_path, monkeypatch):
  194. body = "x" * 5000 # single paragraph, 5000 chars
  195. paragraphs = [body]
  196. async def _run():
  197. input_dir = tmp_path / "input"
  198. parsed_dir = input_dir / "__parsed__"
  199. parsed_dir.mkdir(parents=True)
  200. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  201. blocks_path = parsed_dir / "big.blocks.jsonl"
  202. _write_lightrag_blocks(blocks_path, paragraphs)
  203. rag = _new_rag(tmp_path / "work")
  204. await rag.initialize_storages()
  205. try:
  206. await rag.apipeline_enqueue_documents(
  207. "",
  208. file_paths="big.lightrag",
  209. docs_format=FULL_DOCS_FORMAT_LIGHTRAG,
  210. lightrag_document_paths="__parsed__/big.blocks.jsonl",
  211. track_id="track-big",
  212. )
  213. doc_id = compute_mdhash_id("big.lightrag", prefix="doc-")
  214. full_doc = await rag.full_docs.get_by_id(doc_id)
  215. assert full_doc is not None
  216. # full_docs preserves the marker AND the full merged text.
  217. assert full_doc["content"] == LIGHTRAG_DOC_CONTENT_PREFIX + body
  218. assert full_doc.get("parse_format") == FULL_DOCS_FORMAT_LIGHTRAG
  219. # doc_status reports body-length semantics (no marker leakage).
  220. status_doc = await rag.doc_status.get_by_id(doc_id)
  221. assert status_doc is not None
  222. length = (
  223. status_doc.get("content_length")
  224. if isinstance(status_doc, dict)
  225. else getattr(status_doc, "content_length", None)
  226. )
  227. summary = (
  228. status_doc.get("content_summary")
  229. if isinstance(status_doc, dict)
  230. else getattr(status_doc, "content_summary", "")
  231. )
  232. assert length == 5000, f"content_length should match body, got {length}"
  233. assert not summary.startswith(LIGHTRAG_DOC_CONTENT_PREFIX)
  234. finally:
  235. await rag.finalize_storages()
  236. asyncio.run(_run())
  237. # ---------------------------------------------------------------------------
  238. # T3 — extraction_meta.parse_format reflects persisted format (regression guard)
  239. # ---------------------------------------------------------------------------
  240. @pytest.mark.offline
  241. def test_extraction_meta_records_lightrag_parse_format(tmp_path, monkeypatch):
  242. """Before the unification, a structured-parse fallback tagged
  243. ``extraction_meta.parse_format = raw`` for lightrag docs, silently
  244. mislabelling them in ``doc_status.metadata``. Assert the tag now
  245. reflects the persisted format end-to-end."""
  246. paragraphs = ["Body paragraph for parse_format tagging test."]
  247. async def _run():
  248. input_dir = tmp_path / "input"
  249. parsed_dir = input_dir / "__parsed__"
  250. parsed_dir.mkdir(parents=True)
  251. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  252. blocks_path = parsed_dir / "tag.blocks.jsonl"
  253. _write_lightrag_blocks(blocks_path, paragraphs)
  254. rag = _new_rag(tmp_path / "work")
  255. await rag.initialize_storages()
  256. try:
  257. await rag.apipeline_enqueue_documents(
  258. "",
  259. file_paths="tag.lightrag",
  260. docs_format=FULL_DOCS_FORMAT_LIGHTRAG,
  261. lightrag_document_paths="__parsed__/tag.blocks.jsonl",
  262. track_id="track-tag",
  263. )
  264. await rag.apipeline_process_enqueue_documents()
  265. doc_id = compute_mdhash_id("tag.lightrag", prefix="doc-")
  266. status_doc = await rag.doc_status.get_by_id(doc_id)
  267. assert status_doc is not None
  268. metadata = (
  269. status_doc.get("metadata")
  270. if isinstance(status_doc, dict)
  271. else getattr(status_doc, "metadata", None)
  272. )
  273. assert isinstance(
  274. metadata, dict
  275. ), f"doc_status.metadata should be a dict, got {type(metadata)!r}"
  276. assert metadata.get("parse_format") == FULL_DOCS_FORMAT_LIGHTRAG, (
  277. f"doc_status.metadata.parse_format="
  278. f"{metadata.get('parse_format')!r}; "
  279. f"expected {FULL_DOCS_FORMAT_LIGHTRAG!r} so the multimodal "
  280. f"sidecar merge path opens"
  281. )
  282. finally:
  283. await rag.finalize_storages()
  284. asyncio.run(_run())
  285. # ---------------------------------------------------------------------------
  286. # T4 — JSONL-shaped raw text is still treated as plain text
  287. # ---------------------------------------------------------------------------
  288. @pytest.mark.offline
  289. def test_jsonl_shaped_raw_text_chunks_as_plain_text(tmp_path):
  290. """A raw document whose body coincidentally resembles structured JSONL
  291. must be tokenised plainly — guarding against accidental
  292. re-introduction of removed structured-format detection."""
  293. # No trailing newline — sanitize_text_for_encoding strips trailing
  294. # whitespace on raw enqueue, and that pre-chunking cleanup is unrelated
  295. # to structured-format detection.
  296. pseudo_jsonl = (
  297. json.dumps({"type": "meta", "format_version": "1.0"})
  298. + "\n"
  299. + json.dumps(
  300. {
  301. "type": "text",
  302. "chunk_id": "c0",
  303. "chunk_order_index": 0,
  304. "content": "fake structured line",
  305. }
  306. )
  307. )
  308. async def _run():
  309. rag = _new_rag(tmp_path)
  310. await rag.initialize_storages()
  311. spy = _attach_chunking_spy(rag)
  312. try:
  313. await rag.apipeline_enqueue_documents(
  314. pseudo_jsonl,
  315. file_paths="pseudo.txt",
  316. track_id="track-pseudo",
  317. )
  318. await rag.apipeline_process_enqueue_documents()
  319. finally:
  320. await rag.finalize_storages()
  321. # The full pseudo-jsonl text reaches chunking_func; nothing parses
  322. # it as JSONL and hijacks the chunks list.
  323. assert spy["input"] == pseudo_jsonl
  324. asyncio.run(_run())
  325. # ---------------------------------------------------------------------------
# T5 — explicit R/V process_options dispatch to their dedicated chunkers
  327. # ---------------------------------------------------------------------------
  328. class _ListHandler(logging.Handler):
  329. """Capture log records into an in-memory list.
  330. The ``lightrag`` logger has ``propagate = False`` so pytest's caplog
  331. fixture cannot intercept its records via the root logger; this handler
  332. attaches directly to the logger we care about.
  333. """
  334. def __init__(self) -> None:
  335. super().__init__()
  336. self.records: list[logging.LogRecord] = []
  337. def emit(self, record: logging.LogRecord) -> None:
  338. self.records.append(record)
  339. @pytest.mark.offline
  340. def test_explicit_R_dispatches_to_recursive_character(tmp_path, monkeypatch):
  341. """``process_options=R`` must invoke
  342. :func:`chunking_by_recursive_character` (the new file-chunker
  343. contract) rather than the legacy ``chunking_func``.
  344. Verifies the explicit-selector dispatch contract:
  345. 1. ``chunking_by_recursive_character`` runs at least once.
  346. 2. The legacy ``chunking_func`` is bypassed entirely.
  347. 3. The deprecated "R/V not yet implemented" warning no longer
  348. appears (now that R has a real implementation).
  349. """
  350. pytest.importorskip("langchain_text_splitters")
  351. import lightrag.chunker as chunker_pkg
  352. from lightrag.chunker import chunking_by_recursive_character as real_r
  353. captured = {"calls": 0}
  354. def _r_spy(*args, **kwargs):
  355. captured["calls"] += 1
  356. return real_r(*args, **kwargs)
  357. # The dispatcher does ``from lightrag.chunker import …`` inside the
  358. # function body, which re-resolves the name from the package each
  359. # call — patching the package attribute is enough to intercept it.
  360. monkeypatch.setattr(chunker_pkg, "chunking_by_recursive_character", _r_spy)
  361. async def _run():
  362. rag = _new_rag(tmp_path)
  363. await rag.initialize_storages()
  364. legacy_spy = _attach_chunking_spy(rag)
  365. lightrag_logger = logging.getLogger("lightrag")
  366. list_handler = _ListHandler()
  367. list_handler.setLevel(logging.WARNING)
  368. lightrag_logger.addHandler(list_handler)
  369. try:
  370. await rag.apipeline_enqueue_documents(
  371. "Body paragraph one.\n\nBody paragraph two for R dispatch test.",
  372. file_paths="rs.[native-R].txt",
  373. track_id="track-rs",
  374. process_options="R",
  375. )
  376. await rag.apipeline_process_enqueue_documents()
  377. finally:
  378. lightrag_logger.removeHandler(list_handler)
  379. await rag.finalize_storages()
  380. assert captured["calls"] >= 1, "R must route to chunking_by_recursive_character"
  381. assert legacy_spy["calls"] == 0, (
  382. "explicit process_options selector must bypass legacy "
  383. "chunking_func; got "
  384. f"{legacy_spy['calls']} calls"
  385. )
  386. warning_messages = [
  387. rec.getMessage()
  388. for rec in list_handler.records
  389. if rec.levelno == logging.WARNING
  390. ]
  391. assert not any(
  392. "R/V strategies are not yet implemented" in msg for msg in warning_messages
  393. ), (
  394. "deprecated 'not yet implemented' warning must be gone now "
  395. f"that R is wired up; saw: {warning_messages!r}"
  396. )
  397. asyncio.run(_run())
  398. @pytest.mark.offline
  399. def test_explicit_V_dispatches_to_semantic_vector(tmp_path, monkeypatch):
  400. """``process_options=V`` must invoke
  401. :func:`chunking_by_semantic_vector` and bypass the legacy
  402. ``chunking_func``. The test installs a stub embedding (the spy
  403. short-circuits before the real LangChain SemanticChunker runs) so
  404. the assertion is purely about dispatch routing, not chunk quality.
  405. """
  406. pytest.importorskip("langchain_experimental")
  407. import lightrag.chunker as chunker_pkg
  408. captured = {"calls": 0}
  409. async def _v_spy(*args, **kwargs):
  410. # Short-circuit: skip langchain SemanticChunker entirely and
  411. # return one synthetic chunk. We're only verifying that the
  412. # dispatcher routed here with the right keyword args.
  413. captured["calls"] += 1
  414. captured["embedding_func"] = kwargs.get("embedding_func")
  415. captured["chunk_token_size"] = args[2] if len(args) > 2 else None
  416. return [
  417. {"tokens": 5, "content": "stub", "chunk_order_index": 0},
  418. ]
  419. monkeypatch.setattr(chunker_pkg, "chunking_by_semantic_vector", _v_spy)
  420. async def _run():
  421. rag = _new_rag(tmp_path)
  422. await rag.initialize_storages()
  423. legacy_spy = _attach_chunking_spy(rag)
  424. try:
  425. await rag.apipeline_enqueue_documents(
  426. "Body for V dispatch test. Sentence one. Sentence two.",
  427. file_paths="vs.[native-V].txt",
  428. track_id="track-vs",
  429. process_options="V",
  430. )
  431. await rag.apipeline_process_enqueue_documents()
  432. finally:
  433. await rag.finalize_storages()
  434. assert captured["calls"] >= 1, "V must route to chunking_by_semantic_vector"
  435. assert (
  436. captured.get("embedding_func") is rag.embedding_func
  437. ), "dispatcher must hand the LightRAG embedding_func to the V chunker"
  438. assert legacy_spy["calls"] == 0, (
  439. "explicit process_options selector must bypass legacy " "chunking_func"
  440. )
  441. asyncio.run(_run())
  442. # ---------------------------------------------------------------------------
  443. # T6 — pending_parse → lightrag summary is populated after PROCESSED
  444. # ---------------------------------------------------------------------------
  445. @pytest.mark.offline
  446. def test_pending_parse_lightrag_summary_populated_after_processed(
  447. tmp_path, monkeypatch
  448. ):
  449. """A document enqueued as ``pending_parse`` has empty content at
  450. enqueue time, so ``content_summary`` starts empty. After
  451. ``parse_native`` produces ``.blocks.jsonl`` and the state machine
  452. moves through ANALYZING → PROCESSING → PROCESSED, the summary must
  453. reflect the parsed body — not the enqueue-time placeholder."""
  454. body_paragraphs = [
  455. "Pending-parse summary regression body paragraph one.",
  456. "Body paragraph two carries enough text for a meaningful preview.",
  457. "Body paragraph three closes the document.",
  458. ]
  459. async def _run():
  460. input_dir = tmp_path / "input"
  461. input_dir.mkdir()
  462. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  463. source_path = input_dir / "summary.docx"
  464. source_path.write_bytes(b"fake docx bytes")
  465. # Stub the docx extractor so the parsed blocks are deterministic;
  466. # the adapter still writes the canonical .blocks.jsonl + sidecars.
  467. def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
  468. return [
  469. {
  470. "uuid": f"para-{i}",
  471. "uuid_end": f"para-{i}",
  472. "heading": "",
  473. "content": para,
  474. "type": "text",
  475. "parent_headings": [],
  476. "level": 0,
  477. "table_chunk_role": "none",
  478. }
  479. for i, para in enumerate(body_paragraphs)
  480. ]
  481. monkeypatch.setattr(
  482. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  483. _stub_extract,
  484. )
  485. rag = _new_rag(tmp_path / "work")
  486. await rag.initialize_storages()
  487. try:
  488. await rag.apipeline_enqueue_documents(
  489. "",
  490. file_paths="summary.docx",
  491. docs_format=FULL_DOCS_FORMAT_PENDING_PARSE,
  492. track_id="track-summary",
  493. )
  494. doc_id = compute_mdhash_id("summary.docx", prefix="doc-")
  495. pending = await rag.doc_status.get_by_id(doc_id)
  496. assert pending is not None
  497. pending_summary = (
  498. pending.get("content_summary")
  499. if isinstance(pending, dict)
  500. else getattr(pending, "content_summary", "")
  501. )
  502. # At enqueue time pending_parse content is "" so summary is empty.
  503. assert pending_summary == "", (
  504. f"pending_parse should start with empty summary, got "
  505. f"{pending_summary!r}"
  506. )
  507. await rag.apipeline_process_enqueue_documents()
  508. final = await rag.doc_status.get_by_id(doc_id)
  509. assert final is not None
  510. final_summary = (
  511. final.get("content_summary")
  512. if isinstance(final, dict)
  513. else getattr(final, "content_summary", "")
  514. )
  515. final_length = (
  516. final.get("content_length")
  517. if isinstance(final, dict)
  518. else getattr(final, "content_length", 0)
  519. )
  520. assert final_summary, (
  521. "content_summary still empty after PROCESSED; ANALYZING "
  522. "refresh did not propagate"
  523. )
  524. assert not final_summary.startswith(LIGHTRAG_DOC_CONTENT_PREFIX), (
  525. f"{{LRdoc}} marker leaked into doc_status summary: "
  526. f"{final_summary!r}"
  527. )
  528. # The parser stub produces these paragraphs verbatim; the
  529. # blocks.jsonl writer joins them with a blank line, so the
  530. # summary must be a prefix of that merged text.
  531. merged_text = "\n\n".join(body_paragraphs)
  532. assert final_summary == get_content_summary(merged_text), (
  533. f"summary should match get_content_summary(merged_text); "
  534. f"got {final_summary!r} vs "
  535. f"{get_content_summary(merged_text)!r}"
  536. )
  537. assert final_length == len(merged_text), (
  538. f"content_length should equal len(merged_text)={len(merged_text)}, "
  539. f"got {final_length}"
  540. )
  541. finally:
  542. await rag.finalize_storages()
  543. asyncio.run(_run())
  544. # ---------------------------------------------------------------------------
  545. # T7 — raw text starting with {{LRdoc}} must not be stripped at chunking
  546. # ---------------------------------------------------------------------------
  547. @pytest.mark.offline
  548. def test_raw_text_starting_with_marker_chunked_verbatim(tmp_path):
  549. """A raw document whose body literally begins with ``{{LRdoc}}`` is a
  550. legitimate user input — the chunking branch must not strip those 9
  551. characters. ``strip_lightrag_doc_prefix`` is a lightrag-only contract
  552. enforced by ``parse_native``; raw paths return ``content_data["content"]``
  553. verbatim, so chunking must hand the body to ``chunking_func`` unchanged."""
  554. body_with_marker = LIGHTRAG_DOC_CONTENT_PREFIX + (
  555. "literal-marker-prefix raw document body that should survive "
  556. "the chunking boundary intact."
  557. )
  558. async def _run():
  559. rag = _new_rag(tmp_path)
  560. await rag.initialize_storages()
  561. spy = _attach_chunking_spy(rag)
  562. try:
  563. await rag.apipeline_enqueue_documents(
  564. body_with_marker,
  565. file_paths="marker_raw.txt",
  566. track_id="track-marker",
  567. )
  568. await rag.apipeline_process_enqueue_documents()
  569. finally:
  570. await rag.finalize_storages()
  571. assert spy["calls"] >= 1, "raw doc never reached chunking_func"
  572. # The full body — including the literal {{LRdoc}} prefix — must
  573. # reach chunking_func; nothing in the chunking branch should
  574. # treat the marker as a stripping signal for raw content.
  575. assert spy["input"] == body_with_marker, (
  576. "chunking_func received corrupted input: "
  577. f"got {spy['input']!r}, expected {body_with_marker!r}"
  578. )
  579. assert spy["input"].startswith(
  580. LIGHTRAG_DOC_CONTENT_PREFIX
  581. ), "literal marker prefix lost at chunking boundary"
  582. asyncio.run(_run())