# wxcz_admin / lightrag-cn-git
"""Tests for the unified parser debug CLI (``lightrag/parser/cli.py``).

The CLI behaviour under test is engine-agnostic: argument parsing, the
flat sidecar layout (no ``__parsed__/`` middle layer), the lenient raw
cache strategy (non-empty raw_dir reused without manifest checks), and
the no-archive guarantee on the source file.

We drive these checks via the **docling** engine path because docling's
raw bundle is the easiest to construct as a static fixture (a single JSON
file) with no external service or fixture-file dependencies. The other
two engines exercise the same CLI code path:

- ``native`` would need a real ``.docx`` byte stream end-to-end (golden
  fixtures live under ``tests/parser/docx/golden/`` and have
  their own coverage via ``test_native_docx_golden.py``).
- ``mineru`` would need to mock ``MinerURawClient.download_into`` on
  the cache-miss path, or seed a mineru raw bundle layout (more files
  than docling's). Cache-hit reuses the same CLI orchestration as
  docling, so coverage here implicitly validates mineru's CLI wiring
  too.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import pytest

from lightrag.parser.cli import main


def _make_main_json(
    *,
    origin_filename: str = "demo.pdf",
    with_table: bool = False,
) -> dict[str, Any]:
    """Build a minimal ``DoclingDocument``-shaped payload for raw-dir fixtures.

    The payload always carries two text items (a title and one body line)
    referenced from ``body.children``.

    Args:
        origin_filename: Value stored under ``origin.filename``.
        with_table: When true, add a single 1x2 table and reference it
            from ``body.children`` after the two text items.

    Returns:
        A freshly built dict; nothing is shared between calls.
    """

    def _text_item(index: int, label: str, content: str) -> dict[str, Any]:
        # ``text`` and ``orig`` intentionally carry the same string.
        return {
            "self_ref": f"#/texts/{index}",
            "label": label,
            "text": content,
            "orig": content,
            "content_layer": "body",
            "prov": [],
        }

    text_specs = (("title", "Hello Title"), ("text", "Body line."))
    texts = [
        _text_item(index, label, content)
        for index, (label, content) in enumerate(text_specs)
    ]
    child_refs = [{"$ref": item["self_ref"]} for item in texts]

    tables = []
    if with_table:
        table_ref = "#/tables/0"
        child_refs.append({"$ref": table_ref})
        tables.append(
            {
                "self_ref": table_ref,
                "label": "table",
                "content_layer": "body",
                "data": {
                    "num_rows": 1,
                    "num_cols": 2,
                    "grid": [[{"text": "A"}, {"text": "B"}]],
                },
                "prov": [],
            }
        )

    return {
        "schema_name": "DoclingDocument",
        "version": "1.10.0",
        "origin": {"filename": origin_filename, "mimetype": "application/pdf"},
        "body": {
            "self_ref": "#/body",
            "children": child_refs,
            "content_layer": "body",
            "label": "unspecified",
        },
        "groups": [],
        "texts": texts,
        "pictures": [],
        "tables": tables,
        "key_value_items": [],
        "form_items": [],
    }


def _seed_raw_dir(raw_dir: Path, *, with_table: bool = False) -> None:
    """Create ``raw_dir`` (and parents) and write a ``demo.json`` payload in it.

    Args:
        raw_dir: Directory to create; an existing directory is accepted.
        with_table: Forwarded to ``_make_main_json``.
    """
    raw_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(_make_main_json(with_table=with_table))
    target = raw_dir / "demo.json"
    target.write_text(serialized, encoding="utf-8")


def _read_meta(blocks_path: Path) -> dict[str, Any]:
    """Return the JSON object decoded from the first line of ``blocks_path``.

    The file is read as UTF-8; only its first line is parsed.
    """
    content = blocks_path.read_text(encoding="utf-8")
    header_line = content.splitlines()[0]
    return json.loads(header_line)


@pytest.fixture(autouse=True)
def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Unset the docling-related environment variables for every test.

    Variables that are not set are skipped silently (``raising=False``).
    """
    docling_env_vars = (
        "DOCLING_BBOX_ATTRIBUTES",
        "DOCLING_ENGINE_VERSION",
        "LIGHTRAG_FORCE_REPARSE_DOCLING",
    )
    for var in docling_env_vars:
        monkeypatch.delenv(var, raising=False)


def test_cli_writes_sidecar_from_existing_raw_dir(tmp_path: Path) -> None:
    """A pre-seeded raw dir produces the flat sidecar and leaves the source alone."""
    pdf_path = tmp_path / "demo.pdf"
    # Placeholder bytes: never read; presence is the only check.
    pdf_path.write_bytes(b"%PDF-1.4\n")
    _seed_raw_dir(tmp_path / "demo.pdf.docling_raw", with_table=True)

    exit_code = main([str(pdf_path), "--engine", "docling"])
    assert exit_code == 0

    sidecar_dir = tmp_path / "demo.pdf.parsed"
    blocks_file = sidecar_dir / "demo.blocks.jsonl"
    tables_file = sidecar_dir / "demo.tables.json"
    assert blocks_file.is_file()
    assert tables_file.is_file()

    header = _read_meta(blocks_file)
    assert header["parse_engine"] == "docling"
    assert header["document_name"] == "demo.pdf"
    assert header["table_file"] is True
    # The source file must still be where we put it after the run.
    assert pdf_path.is_file()


def test_cli_doc_id_default_is_stable_across_runs(tmp_path: Path) -> None:
    """Two runs without ``--doc-id`` yield the same doc id and block ids."""
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")
    _seed_raw_dir(tmp_path / "demo.pdf.docling_raw")

    blocks_file = tmp_path / "demo.pdf.parsed" / "demo.blocks.jsonl"
    cli_args = [str(pdf_path), "--engine", "docling"]

    def _run_and_collect() -> tuple[dict[str, Any], list[Any]]:
        # One CLI run, then split the sidecar into (meta line, block ids).
        assert main(cli_args) == 0
        lines = blocks_file.read_text(encoding="utf-8").splitlines()
        header = json.loads(lines[0])
        block_ids = [json.loads(raw)["blockid"] for raw in lines[1:]]
        return header, block_ids

    meta_a, ids_a = _run_and_collect()
    meta_b, ids_b = _run_and_collect()

    assert meta_a["doc_id"].startswith("doc-")
    assert meta_a["doc_id"] == meta_b["doc_id"]
    # Guard against a vacuous pass: the id list must be non-empty too.
    assert ids_a and ids_a == ids_b


def test_cli_doc_id_override(tmp_path: Path) -> None:
    """``--doc-id`` is written to the meta line and reflected in table ids."""
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")
    _seed_raw_dir(tmp_path / "demo.pdf.docling_raw", with_table=True)

    id_body = "a" * 32
    forced_id = "doc-" + id_body
    exit_code = main(
        [str(pdf_path), "--engine", "docling", "--doc-id", forced_id]
    )
    assert exit_code == 0

    sidecar_dir = tmp_path / "demo.pdf.parsed"
    header = _read_meta(sidecar_dir / "demo.blocks.jsonl")
    assert header["doc_id"] == forced_id

    tables_text = (sidecar_dir / "demo.tables.json").read_text(encoding="utf-8")
    tables = json.loads(tables_text)["tables"]
    assert tables
    # Table ids are expected to reuse the doc id's 32-char body after "tb-".
    expected_prefix = "tb-" + id_body + "-"
    assert all(tid.startswith(expected_prefix) for tid in tables)


def test_cli_custom_sidecar_parent_dir(tmp_path: Path) -> None:
    """``-o`` redirects the sidecar to another parent dir; the source stays put."""
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")

    output_parent = tmp_path / "elsewhere"
    output_parent.mkdir()
    # The raw bundle is seeded under the custom parent, not next to the source.
    _seed_raw_dir(output_parent / "demo.pdf.docling_raw")

    exit_code = main([str(pdf_path), "--engine", "docling", "-o", str(output_parent)])
    assert exit_code == 0

    redirected_blocks = output_parent / "demo.pdf.parsed" / "demo.blocks.jsonl"
    assert redirected_blocks.is_file()
    # Nothing should land in the source's parent directory.
    default_sidecar = tmp_path / "demo.pdf.parsed"
    assert not default_sidecar.exists()
    # Source file is preserved in place.
    assert pdf_path.is_file()


def test_cli_missing_input_file_returns_error(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """A nonexistent input path gives exit code 1 and names the path on stderr."""
    absent_path = tmp_path / "nope.pdf"

    exit_code = main([str(absent_path), "--engine", "docling"])
    assert exit_code == 1

    stderr_text = capsys.readouterr().err
    # The message is expected to contain the resolved (absolute) path.
    assert str(absent_path.resolve()) in stderr_text


def test_cli_rejects_suffix_engine_mismatch(
    tmp_path: Path, capsys: pytest.CaptureFixture[str]
) -> None:
    """The ``native`` engine rejects a ``.pdf`` input with a descriptive error."""
    # native only accepts .docx; feeding it a .pdf should fail up-front with
    # a clear error rather than crashing deep inside the IR builder.
    pdf_path = tmp_path / "demo.pdf"
    pdf_path.write_bytes(b"%PDF-1.4\n")

    exit_code = main([str(pdf_path), "--engine", "native"])
    assert exit_code == 1

    stderr_text = capsys.readouterr().err
    # Engine name, offending suffix, and the supported suffix (docx) must all
    # be mentioned in the error output.
    for expected_token in ("native", "pdf", "docx"):
        assert expected_token in stderr_text