| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- """Tests for the unified parser debug CLI (``lightrag/parser/cli.py``).
- The CLI behaviour under test is engine-agnostic: argument parsing, the
- flat sidecar layout (no ``__parsed__/`` middle layer), the lenient raw
- cache strategy (non-empty raw_dir reused without manifest checks), and
- the no-archive guarantee on the source file.
- We drive these checks via the **docling** engine path because docling's
- raw bundle is the easiest to construct as a static fixture (a single JSON
- file) with zero external service or fixture-file dependencies. The other
- two engines exercise the same CLI code path:
- - ``native`` would need a real ``.docx`` byte stream end-to-end (golden
- fixtures live under ``tests/parser/docx/golden/`` and have
- their own coverage via ``test_native_docx_golden.py``).
- - ``mineru`` would need to mock ``MinerURawClient.download_into`` on
- the cache-miss path, or seed a mineru raw bundle layout (more files
- than docling's). Cache-hit reuses the same CLI orchestration as
- docling, so coverage here implicitly validates mineru's CLI wiring
- too.
- """
- from __future__ import annotations
- import json
- from pathlib import Path
- from typing import Any
- import pytest
- from lightrag.parser.cli import main
- def _make_main_json(
- *,
- origin_filename: str = "demo.pdf",
- with_table: bool = False,
- ) -> dict[str, Any]:
- payload: dict[str, Any] = {
- "schema_name": "DoclingDocument",
- "version": "1.10.0",
- "origin": {"filename": origin_filename, "mimetype": "application/pdf"},
- "body": {
- "self_ref": "#/body",
- "children": [{"$ref": "#/texts/0"}, {"$ref": "#/texts/1"}],
- "content_layer": "body",
- "label": "unspecified",
- },
- "groups": [],
- "texts": [
- {
- "self_ref": "#/texts/0",
- "label": "title",
- "text": "Hello Title",
- "orig": "Hello Title",
- "content_layer": "body",
- "prov": [],
- },
- {
- "self_ref": "#/texts/1",
- "label": "text",
- "text": "Body line.",
- "orig": "Body line.",
- "content_layer": "body",
- "prov": [],
- },
- ],
- "pictures": [],
- "tables": [],
- "key_value_items": [],
- "form_items": [],
- }
- if with_table:
- payload["body"]["children"].append({"$ref": "#/tables/0"})
- payload["tables"].append(
- {
- "self_ref": "#/tables/0",
- "label": "table",
- "content_layer": "body",
- "data": {
- "num_rows": 1,
- "num_cols": 2,
- "grid": [[{"text": "A"}, {"text": "B"}]],
- },
- "prov": [],
- }
- )
- return payload
def _seed_raw_dir(raw_dir: Path, *, with_table: bool = False) -> None:
    """Create ``raw_dir`` and write the fixture payload to ``demo.json`` inside it.

    Args:
        raw_dir: Directory to create (parents included; an existing one is fine).
        with_table: Forwarded to ``_make_main_json`` to include the sample table.
    """
    raw_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(_make_main_json(with_table=with_table))
    main_json_path = raw_dir / "demo.json"
    main_json_path.write_text(serialized, encoding="utf-8")
- def _read_meta(blocks_path: Path) -> dict[str, Any]:
- return json.loads(blocks_path.read_text(encoding="utf-8").splitlines()[0])
@pytest.fixture(autouse=True)
def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Unset the docling-related environment variables for every test.

    Runs automatically (``autouse``); variables that are not set are ignored.
    """
    docling_env_vars = (
        "DOCLING_BBOX_ATTRIBUTES",
        "DOCLING_ENGINE_VERSION",
        "LIGHTRAG_FORCE_REPARSE_DOCLING",
    )
    for env_var in docling_env_vars:
        monkeypatch.delenv(env_var, raising=False)
def test_cli_writes_sidecar_from_existing_raw_dir(tmp_path: Path) -> None:
    """A pre-seeded raw dir next to the source yields a flat ``.parsed`` sidecar."""
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")  # never read; presence is the only check
    _seed_raw_dir(tmp_path / "demo.pdf.docling_raw", with_table=True)

    exit_code = main([str(pdf_path), "--engine", "docling"])
    assert exit_code == 0

    sidecar_dir = tmp_path / "demo.pdf.parsed"
    blocks_file = sidecar_dir / "demo.blocks.jsonl"
    tables_file = sidecar_dir / "demo.tables.json"
    assert blocks_file.is_file()
    assert tables_file.is_file()

    header = _read_meta(blocks_file)
    assert header["parse_engine"] == "docling"
    assert header["document_name"] == "demo.pdf"
    assert header["table_file"] is True

    # No-archive guarantee: the source file must still be where we put it.
    assert pdf_path.is_file()
def test_cli_doc_id_default_is_stable_across_runs(tmp_path: Path) -> None:
    """Two runs without ``--doc-id`` produce the same doc id and block ids."""
    source = tmp_path / "demo.pdf"
    source.write_bytes(b"%PDF-1.4\n")
    _seed_raw_dir(tmp_path / "demo.pdf.docling_raw")
    blocks_path = tmp_path / "demo.pdf.parsed" / "demo.blocks.jsonl"

    def _run_and_load() -> tuple[dict[str, Any], list[str]]:
        """Run the CLI once and return (meta record, block ids) from the sidecar."""
        assert main([str(source), "--engine", "docling"]) == 0
        # Iterate the file line by line rather than using str.splitlines():
        # the latter also splits on U+2028/U+2029/U+0085, which may occur
        # unescaped inside JSON strings and would break a record in two.
        with blocks_path.open(encoding="utf-8") as handle:
            records = [json.loads(line) for line in handle]
        return records[0], [record["blockid"] for record in records[1:]]

    first_meta, first_block_ids = _run_and_load()
    second_meta, second_block_ids = _run_and_load()

    assert first_meta["doc_id"].startswith("doc-")
    assert first_meta["doc_id"] == second_meta["doc_id"]
    # Require at least one block so the equality check is not vacuous.
    assert first_block_ids and first_block_ids == second_block_ids
def test_cli_doc_id_override(tmp_path: Path) -> None:
    """``--doc-id`` lands in the meta record and prefixes every table id."""
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")
    _seed_raw_dir(tmp_path / "demo.pdf.docling_raw", with_table=True)

    forced_doc_id = "doc-" + "a" * 32
    cli_args = [str(pdf_path), "--engine", "docling", "--doc-id", forced_doc_id]
    exit_code = main(cli_args)
    assert exit_code == 0

    sidecar_dir = tmp_path / "demo.pdf.parsed"
    header = _read_meta(sidecar_dir / "demo.blocks.jsonl")
    assert header["doc_id"] == forced_doc_id

    tables_text = (sidecar_dir / "demo.tables.json").read_text(encoding="utf-8")
    table_entries = json.loads(tables_text)["tables"]
    assert table_entries
    # Each table id is expected to embed the hex part of the overridden doc id.
    expected_prefix = "tb-" + "a" * 32 + "-"
    for table_id in table_entries:
        assert table_id.startswith(expected_prefix)
def test_cli_custom_sidecar_parent_dir(tmp_path: Path) -> None:
    """``-o`` redirects the sidecar to another parent and leaves the source alone."""
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")

    output_parent = tmp_path / "elsewhere"
    output_parent.mkdir()
    _seed_raw_dir(output_parent / "demo.pdf.docling_raw")

    exit_code = main([str(pdf_path), "--engine", "docling", "-o", str(output_parent)])
    assert exit_code == 0

    redirected_blocks = output_parent / "demo.pdf.parsed" / "demo.blocks.jsonl"
    assert redirected_blocks.is_file()

    # The default location beside the source must stay untouched.
    default_sidecar = tmp_path / "demo.pdf.parsed"
    assert not default_sidecar.exists()

    # The source file itself is still in place.
    assert pdf_path.is_file()
def test_cli_missing_input_file_returns_error(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """A nonexistent input returns exit code 1 and names the resolved path on stderr."""
    absent_path = tmp_path / "nope.pdf"

    exit_code = main([str(absent_path), "--engine", "docling"])
    assert exit_code == 1

    captured = capsys.readouterr()
    assert str(absent_path.resolve()) in captured.err
def test_cli_rejects_suffix_engine_mismatch(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """Pairing the ``native`` engine with a ``.pdf`` input fails with exit code 1."""
    # native only accepts .docx; a .pdf should be rejected up-front with a
    # clear message instead of crashing deep inside the IR builder.
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")

    exit_code = main([str(pdf_path), "--engine", "native"])
    assert exit_code == 1

    stderr_text = capsys.readouterr().err
    assert "native" in stderr_text
    assert "pdf" in stderr_text
    assert "docx" in stderr_text  # supported suffix list mentions docx
|