test_parser_cli.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
"""Tests for the unified parser debug CLI (``lightrag/parser/cli.py``).

The CLI behaviour under test is engine-agnostic: argument parsing, the
flat sidecar layout (no ``__parsed__/`` middle layer), the lenient raw
cache strategy (non-empty raw_dir reused without manifest checks), and
the no-archive guarantee on the source file.

We drive these checks via the **docling** engine path because docling's
raw bundle is the easiest to construct as a static fixture (a single JSON
file) with zero external service or fixture-file dependency. The other
two engines exercise the same CLI code path:

- ``native`` would need a real ``.docx`` byte stream end-to-end (golden
  fixtures live under ``tests/parser/docx/golden/`` and have
  their own coverage via ``test_native_docx_golden.py``).
- ``mineru`` would need to mock ``MinerURawClient.download_into`` on
  the cache-miss path, or seed a mineru raw bundle layout (more files
  than docling's). Cache-hit reuses the same CLI orchestration as
  docling, so coverage here implicitly validates mineru's CLI wiring
  too.
"""
  19. from __future__ import annotations
  20. import json
  21. from pathlib import Path
  22. from typing import Any
  23. import pytest
  24. from lightrag.parser.cli import main
  25. def _make_main_json(
  26. *,
  27. origin_filename: str = "demo.pdf",
  28. with_table: bool = False,
  29. ) -> dict[str, Any]:
  30. payload: dict[str, Any] = {
  31. "schema_name": "DoclingDocument",
  32. "version": "1.10.0",
  33. "origin": {"filename": origin_filename, "mimetype": "application/pdf"},
  34. "body": {
  35. "self_ref": "#/body",
  36. "children": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}],
  37. "content_layer": "body",
  38. "label": "unspecified",
  39. },
  40. "groups": [],
  41. "texts": [
  42. {
  43. "self_ref": "#/texts/0",
  44. "label": "title",
  45. "text": "Hello Title",
  46. "orig": "Hello Title",
  47. "content_layer": "body",
  48. "prov": [],
  49. },
  50. {
  51. "self_ref": "#/texts/1",
  52. "label": "text",
  53. "text": "Body line.",
  54. "orig": "Body line.",
  55. "content_layer": "body",
  56. "prov": [],
  57. },
  58. ],
  59. "pictures": [],
  60. "tables": [],
  61. "key_value_items": [],
  62. "form_items": [],
  63. }
  64. if with_table:
  65. payload["body"]["children"].append({"$ref": "#/tables/0"})
  66. payload["tables"].append(
  67. {
  68. "self_ref": "#/tables/0",
  69. "label": "table",
  70. "content_layer": "body",
  71. "data": {
  72. "num_rows": 1,
  73. "num_cols": 2,
  74. "grid": [[{"text": "A"}, {"text": "B"}]],
  75. },
  76. "prov": [],
  77. }
  78. )
  79. return payload
  80. def _seed_raw_dir(raw_dir: Path, *, with_table: bool = False) -> None:
  81. raw_dir.mkdir(parents=True, exist_ok=True)
  82. (raw_dir / "demo.json").write_text(
  83. json.dumps(_make_main_json(with_table=with_table)),
  84. encoding="utf-8",
  85. )
  86. def _read_meta(blocks_path: Path) -> dict[str, Any]:
  87. return json.loads(blocks_path.read_text(encoding="utf-8").splitlines()[0])
  88. @pytest.fixture(autouse=True)
  89. def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None:
  90. for name in (
  91. "DOCLING_BBOX_ATTRIBUTES",
  92. "DOCLING_ENGINE_VERSION",
  93. "LIGHTRAG_FORCE_REPARSE_DOCLING",
  94. ):
  95. monkeypatch.delenv(name, raising=False)
  96. def test_cli_writes_sidecar_from_existing_raw_dir(tmp_path: Path) -> None:
  97. source = tmp_path / "demo.pdf"
  98. source.write_bytes(b"%PDF-1.4\n") # never read; presence is the only check
  99. _seed_raw_dir(tmp_path / "demo.pdf.docling_raw", with_table=True)
  100. rc = main([str(source), "--engine", "docling"])
  101. assert rc == 0
  102. parsed_dir = tmp_path / "demo.pdf.parsed"
  103. blocks_path = parsed_dir / "demo.blocks.jsonl"
  104. assert blocks_path.is_file()
  105. assert (parsed_dir / "demo.tables.json").is_file()
  106. meta = _read_meta(blocks_path)
  107. assert meta["parse_engine"] == "docling"
  108. assert meta["document_name"] == "demo.pdf"
  109. assert meta["table_file"] is True
  110. # Source file stays where it was — the CLI mocks the archive step.
  111. assert source.is_file()
  112. def test_cli_doc_id_default_is_stable_across_runs(tmp_path: Path) -> None:
  113. source = tmp_path / "demo.pdf"
  114. source.write_bytes(b"%PDF-1.4\n")
  115. _seed_raw_dir(tmp_path / "demo.pdf.docling_raw")
  116. blocks_path = tmp_path / "demo.pdf.parsed" / "demo.blocks.jsonl"
  117. assert main([str(source), "--engine", "docling"]) == 0
  118. first_lines = blocks_path.read_text(encoding="utf-8").splitlines()
  119. first_meta = json.loads(first_lines[0])
  120. first_block_ids = [json.loads(line)["blockid"] for line in first_lines[1:]]
  121. assert main([str(source), "--engine", "docling"]) == 0
  122. second_lines = blocks_path.read_text(encoding="utf-8").splitlines()
  123. second_meta = json.loads(second_lines[0])
  124. second_block_ids = [json.loads(line)["blockid"] for line in second_lines[1:]]
  125. assert first_meta["doc_id"].startswith("doc-")
  126. assert first_meta["doc_id"] == second_meta["doc_id"]
  127. assert first_block_ids and first_block_ids == second_block_ids
  128. def test_cli_doc_id_override(tmp_path: Path) -> None:
  129. source = tmp_path / "demo.pdf"
  130. source.write_bytes(b"%PDF-1.4\n")
  131. _seed_raw_dir(tmp_path / "demo.pdf.docling_raw", with_table=True)
  132. override = "doc-" + "a" * 32
  133. rc = main([str(source), "--engine", "docling", "--doc-id", override])
  134. assert rc == 0
  135. parsed_dir = tmp_path / "demo.pdf.parsed"
  136. meta = _read_meta(parsed_dir / "demo.blocks.jsonl")
  137. assert meta["doc_id"] == override
  138. tables = json.loads((parsed_dir / "demo.tables.json").read_text(encoding="utf-8"))[
  139. "tables"
  140. ]
  141. assert tables
  142. assert all(tid.startswith("tb-" + "a" * 32 + "-") for tid in tables)
  143. def test_cli_custom_sidecar_parent_dir(tmp_path: Path) -> None:
  144. source = tmp_path / "demo.pdf"
  145. source.write_bytes(b"%PDF-1.4\n")
  146. custom_parent = tmp_path / "elsewhere"
  147. custom_parent.mkdir()
  148. _seed_raw_dir(custom_parent / "demo.pdf.docling_raw")
  149. rc = main([str(source), "--engine", "docling", "-o", str(custom_parent)])
  150. assert rc == 0
  151. assert (custom_parent / "demo.pdf.parsed" / "demo.blocks.jsonl").is_file()
  152. # Nothing should land in the source's parent directory.
  153. assert not (tmp_path / "demo.pdf.parsed").exists()
  154. # Source file is preserved in place.
  155. assert source.is_file()
  156. def test_cli_missing_input_file_returns_error(
  157. tmp_path: Path, capsys: pytest.CaptureFixture[str]
  158. ) -> None:
  159. missing = tmp_path / "nope.pdf"
  160. rc = main([str(missing), "--engine", "docling"])
  161. assert rc == 1
  162. err = capsys.readouterr().err
  163. assert str(missing.resolve()) in err
  164. def test_cli_rejects_suffix_engine_mismatch(
  165. tmp_path: Path, capsys: pytest.CaptureFixture[str]
  166. ) -> None:
  167. # native only accepts .docx; feeding it a .pdf should fail up-front with
  168. # a clear error rather than crashing deep inside the IR builder.
  169. source = tmp_path / "demo.pdf"
  170. source.write_bytes(b"%PDF-1.4\n")
  171. rc = main([str(source), "--engine", "native"])
  172. assert rc == 1
  173. err = capsys.readouterr().err
  174. assert "native" in err
  175. assert "pdf" in err
  176. assert "docx" in err # supported suffix list mentions docx