| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636 |
- """Integration tests for ``parse_docling`` with the unified sidecar pipeline.
- Stubs :class:`DoclingRawClient.download_into` so no real docling-serve is
- contacted; the focus is on:
- - happy path: cache miss → fake bundle written → sidecar emitted with all
- expected files at the spec-compliant locations
- - cache hit: a pre-existing valid ``*.docling_raw/`` + manifest causes
- ``DoclingRawClient.download_into`` NOT to be called
- - ``LIGHTRAG_FORCE_REPARSE_DOCLING=true`` forces a re-download even when
- the manifest is valid
- - source content swap → cache miss
- - options_signature change (``DOCLING_OCR_LANG`` toggle) → cache miss
- - adapter sees zero blocks → parse fails loudly (no half-baked sidecar)
- """
- from __future__ import annotations
- import asyncio
- import json
- from datetime import datetime, timezone
- from pathlib import Path
- from typing import Any
- import numpy as np
- import pytest
- from lightrag import LightRAG
- from lightrag.constants import FULL_DOCS_FORMAT_LIGHTRAG
- from lightrag.parser.external import (
- Manifest,
- ManifestFile,
- compute_size_and_hash,
- write_manifest,
- )
- from lightrag.parser.external.docling.cache import (
- compute_options_signature,
- snapshot_tunable_env,
- )
- from lightrag.parser.external.docling.client import FIXED_CONSTANTS
- from lightrag.utils import EmbeddingFunc, Tokenizer
- class _SimpleTokenizerImpl:
- def encode(self, content: str) -> list[int]:
- return [ord(ch) for ch in content]
- def decode(self, tokens: list[int]) -> str:
- return "".join(chr(t) for t in tokens)
- async def _mock_embedding(texts: list[str]) -> np.ndarray:
- return np.random.rand(len(texts), 32)
- async def _mock_llm(prompt: Any, **kwargs: Any) -> str:
- return '{"name":"x","summary":"s","detail_description":"d"}'
def _new_rag(tmp_path: Path) -> LightRAG:
    """Build a ``LightRAG`` rooted at ``tmp_path`` wired to the mock helpers.

    The workspace name embeds ``tmp_path.name``; the LLM, embedding function
    and tokenizer are the in-file mocks, and VLM processing is disabled.
    """
    embedding = EmbeddingFunc(
        embedding_dim=32,
        max_token_size=4096,
        func=_mock_embedding,
    )
    tokenizer = Tokenizer("mock-tokenizer", _SimpleTokenizerImpl())
    return LightRAG(
        working_dir=str(tmp_path),
        workspace=f"test-docling-sidecar-{tmp_path.name}",
        llm_model_func=_mock_llm,
        embedding_func=embedding,
        tokenizer=tokenizer,
        vlm_process_enable=False,
    )
- _FAKE_DOCLING_JSON = {
- "schema_name": "DoclingDocument",
- "version": "1.10.0",
- "origin": {"filename": "demo.pdf", "mimetype": "application/pdf"},
- "body": {
- "self_ref": "#/body",
- "children": [
- {"$ref": "#/texts/0"},
- ],
- "content_layer": "body",
- "label": "unspecified",
- },
- "groups": [],
- "texts": [
- {
- "self_ref": "#/texts/0",
- "label": "section_header",
- "text": "Intro",
- "orig": "Intro",
- "level": 1,
- "content_layer": "body",
- "children": [
- {"$ref": "#/texts/1"},
- {"$ref": "#/tables/0"},
- {"$ref": "#/pictures/0"},
- {"$ref": "#/texts/2"},
- ],
- "prov": [
- {
- "page_no": 1,
- "bbox": {
- "l": 10.0,
- "t": 100.0,
- "r": 200.0,
- "b": 80.0,
- "coord_origin": "BOTTOMLEFT",
- },
- "charspan": [0, 5],
- }
- ],
- },
- {
- "self_ref": "#/texts/1",
- "label": "text",
- "text": "Body paragraph.",
- "orig": "Body paragraph.",
- "content_layer": "body",
- "prov": [
- {
- "page_no": 1,
- "bbox": {
- "l": 10.0,
- "t": 60.0,
- "r": 200.0,
- "b": 40.0,
- "coord_origin": "BOTTOMLEFT",
- },
- "charspan": [0, 15],
- }
- ],
- },
- {
- "self_ref": "#/texts/2",
- "label": "formula",
- "text": "E = mc^2",
- "orig": "E = mc^2",
- "content_layer": "body",
- "prov": [],
- },
- ],
- "tables": [
- {
- "self_ref": "#/tables/0",
- "label": "table",
- "content_layer": "body",
- "data": {
- "num_rows": 2,
- "num_cols": 2,
- "grid": [
- [{"text": "h1"}, {"text": "h2"}],
- [{"text": "a"}, {"text": "b"}],
- ],
- },
- "prov": [],
- }
- ],
- "pictures": [
- {
- "self_ref": "#/pictures/0",
- "label": "picture",
- "content_layer": "body",
- "image": {"uri": "artifacts/img_000000.png", "mimetype": "image/png"},
- "prov": [],
- }
- ],
- "key_value_items": [],
- "form_items": [],
- "pages": {"1": {"size": {"width": 612.0, "height": 792.0}, "page_no": 1}},
- }
def _install_fake_download(monkeypatch: pytest.MonkeyPatch) -> dict[str, int]:
    """Patch ``DoclingRawClient.download_into`` with a call-counting fake.

    The fake writes a synthetic raw bundle (``demo.json``, ``demo.md`` and one
    artifact image) plus a matching manifest into ``raw_dir``.  The returned
    dict exposes the number of invocations under the ``"calls"`` key.
    """
    import lightrag.parser.external.docling.client as client_mod

    counters = {"calls": 0}

    async def _fake_download(self, raw_dir: Path, source_file_path: Path, **_kwargs):
        counters["calls"] += 1

        # Lay the bundle out on disk: critical JSON, markdown, one artifact.
        raw_dir.mkdir(parents=True, exist_ok=True)
        critical_path = raw_dir / "demo.json"
        critical_path.write_text(json.dumps(_FAKE_DOCLING_JSON), encoding="utf-8")
        markdown_path = raw_dir / "demo.md"
        markdown_path.write_text("# fake md", encoding="utf-8")
        artifacts_dir = raw_dir / "artifacts"
        artifacts_dir.mkdir(exist_ok=True)
        image_path = artifacts_dir / "img_000000.png"
        image_path.write_bytes(b"\x89PNG fake")

        # Describe what was written so the manifest matches the files.
        source_size, source_hash = compute_size_and_hash(source_file_path)
        critical_size, critical_hash = compute_size_and_hash(critical_path)
        extra_files = [
            ManifestFile(path="demo.md", size=markdown_path.stat().st_size),
            ManifestFile(
                path="artifacts/img_000000.png",
                size=image_path.stat().st_size,
            ),
        ]
        extras_total = sum(entry.size for entry in extra_files)

        # The signature is computed from the environment at call time.
        signature = compute_options_signature(
            tunable_env=snapshot_tunable_env(),
            fixed_constants=FIXED_CONSTANTS,
        )

        manifest = Manifest(
            engine="docling",
            source_content_hash=source_hash,
            source_size_bytes=source_size,
            source_filename_at_parse=source_file_path.name,
            critical_file=ManifestFile(
                path="demo.json", size=critical_size, sha256=critical_hash
            ),
            files=extra_files,
            total_size_bytes=critical_size + extras_total,
            task_id=f"fake-{counters['calls']}",
            endpoint_signature="http://docling.test",
            options_signature=signature,
            extras={"fixed_constants": dict(FIXED_CONSTANTS)},
            downloaded_at=datetime.now(timezone.utc).isoformat(timespec="seconds"),
        )
        write_manifest(raw_dir, manifest)
        return manifest

    monkeypatch.setattr(client_mod.DoclingRawClient, "download_into", _fake_download)
    return counters
def _stub_pipeline(monkeypatch: pytest.MonkeyPatch, rag: LightRAG, src: Path) -> None:
    """Install the pipeline-level stubs shared by the tests.

    ``archive_docx_source_after_full_docs_sync`` in ``lightrag.pipeline`` is
    replaced with a no-op coroutine, and ``rag``'s
    ``_resolve_source_file_for_parser`` always returns ``str(src)``.
    """
    import lightrag.pipeline as pipeline_module

    async def _skip_archive(_p: str) -> None:
        return None

    def _resolve_to_src(_p):
        return str(src)

    monkeypatch.setattr(
        pipeline_module,
        "archive_docx_source_after_full_docs_sync",
        _skip_archive,
    )
    monkeypatch.setattr(rag, "_resolve_source_file_for_parser", _resolve_to_src)
- def _seed_doc_status(rag: LightRAG, doc_id: str) -> Any:
- return rag.doc_status.upsert(
- {
- doc_id: {
- "status": "PARSING",
- "content_summary": "",
- "content_length": 0,
- "chunks_count": 0,
- "chunks_list": [],
- "created_at": "2026-05-18T00:00:00+00:00",
- "updated_at": "2026-05-18T00:00:00+00:00",
- "file_path": "demo.pdf",
- "track_id": "trk",
- "content_hash": "",
- "metadata": {},
- }
- }
- )
- # ---------------------------------------------------------------------------
- # Tests
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_parse_docling_emits_compliant_sidecar(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Cache-miss happy path: one download, then a complete sidecar on disk.

    Checks the sidecar file set, the metadata line of ``demo.blocks.jsonl``,
    the preserved raw bundle, the resolved drawing path and the table
    ``self_ref``.
    """

    async def _run() -> None:
        monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
        counters = _install_fake_download(monkeypatch)

        input_dir = tmp_path / "inputs" / "ws"
        input_dir.mkdir(parents=True)
        src = input_dir / "demo.pdf"
        src.write_bytes(b"PDFPDF" * 256)

        rag = _new_rag(tmp_path)
        await rag.initialize_storages()
        try:
            _stub_pipeline(monkeypatch, rag, src)
            doc_id = "doc-abcdef0123456789abcdef0123456789"
            await _seed_doc_status(rag, doc_id)

            parsed = await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 1

            parsed_dir = Path(parsed["blocks_path"]).parent
            assert parsed["parse_format"] == FULL_DOCS_FORMAT_LIGHTRAG
            assert parsed_dir.name == "demo.pdf.parsed"

            files = {p.name for p in parsed_dir.iterdir() if p.is_file()}
            assert "demo.blocks.jsonl" in files
            assert "demo.tables.json" in files
            assert "demo.drawings.json" in files
            assert "demo.equations.json" in files
            assert (parsed_dir / "demo.blocks.assets").is_dir()
            assert (parsed_dir / "demo.blocks.assets" / "img_000000.png").is_file()

            # Read with an explicit encoding so the result does not depend on
            # the platform's locale default.
            blocks_raw = (parsed_dir / "demo.blocks.jsonl").read_text(
                encoding="utf-8"
            )
            lines = blocks_raw.splitlines()
            meta = json.loads(lines[0])
            rows = [json.loads(line) for line in lines[1:]]

            assert meta["parse_engine"] == "docling"
            assert meta["bbox_attributes"] == {"origin": "LEFTBOTTOM"}
            assert "max" not in meta["bbox_attributes"]
            assert "page_sizes" not in meta["bbox_attributes"]
            assert meta["table_file"] is True
            assert meta["drawing_file"] is True
            assert meta["equation_file"] is True
            # No label="title" in the fixture (matches the typical PDF case
            # where docling produces only section_headers) → doc_title falls
            # back to the document stem.
            assert meta["doc_title"] == "demo"

            contents = " ".join(row.get("content", "") for row in rows)
            assert '<table id="tb-' in contents
            assert "<drawing" in contents
            assert "<equation" in contents

            # Raw bundle preserved next to sidecar
            raw_dir = parsed_dir.parent / "demo.pdf.docling_raw"
            assert (raw_dir / "_manifest.json").is_file()
            assert (raw_dir / "demo.json").is_file()
            assert (raw_dir / "demo.md").is_file()
            assert (raw_dir / "artifacts" / "img_000000.png").is_file()

            # Drawing path correctly resolved
            drawings = json.loads(
                (parsed_dir / "demo.drawings.json").read_text(encoding="utf-8")
            )["drawings"]
            (drawing_id, drawing_item) = next(iter(drawings.items()))
            assert drawing_id.startswith("im-")
            assert drawing_item["path"] == "demo.blocks.assets/img_000000.png"

            # Table self_ref propagated
            tables = json.loads(
                (parsed_dir / "demo.tables.json").read_text(encoding="utf-8")
            )["tables"]
            (_, table_item) = next(iter(tables.items()))
            assert table_item.get("self_ref") == "#/tables/0"
        finally:
            await rag.finalize_storages()

    # Run on a private loop and close it afterwards so its resources are
    # released deterministically instead of being left to the collector.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    finally:
        loop.close()
@pytest.mark.offline
def test_parse_docling_cache_hit_skips_download(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A second parse reuses the bundle; the force-reparse flag overrides that.

    The download counter must stay at 1 after the second ``parse_docling``
    call and reach 2 once ``LIGHTRAG_FORCE_REPARSE_DOCLING=true`` is set.
    """

    async def _run() -> None:
        monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
        counters = _install_fake_download(monkeypatch)

        input_dir = tmp_path / "inputs" / "ws"
        input_dir.mkdir(parents=True)
        src = input_dir / "demo.pdf"
        src.write_bytes(b"PDFPDF" * 256)

        rag = _new_rag(tmp_path)
        await rag.initialize_storages()
        try:
            _stub_pipeline(monkeypatch, rag, src)
            doc_id = "doc-abcdef0123456789abcdef0123456789"
            await _seed_doc_status(rag, doc_id)

            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 1

            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 1, "cache hit must not re-download"

            monkeypatch.setenv("LIGHTRAG_FORCE_REPARSE_DOCLING", "true")
            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 2
        finally:
            await rag.finalize_storages()

    # Run on a private loop and close it afterwards so its resources are
    # released deterministically instead of being left to the collector.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    finally:
        loop.close()
@pytest.mark.offline
def test_parse_docling_cache_invalidates_on_source_change(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Changing the source bytes (same length) must trigger a second download."""

    async def _run() -> None:
        monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
        counters = _install_fake_download(monkeypatch)

        input_dir = tmp_path / "inputs" / "ws"
        input_dir.mkdir(parents=True)
        src = input_dir / "demo.pdf"
        src.write_bytes(b"PDFPDF" * 256)

        rag = _new_rag(tmp_path)
        await rag.initialize_storages()
        try:
            _stub_pipeline(monkeypatch, rag, src)
            doc_id = "doc-abcdef0123456789abcdef0123456789"
            await _seed_doc_status(rag, doc_id)

            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 1

            # Replace only the first byte: the size is unchanged, the content
            # is not.
            data = src.read_bytes()
            src.write_bytes(b"\x00" + data[1:])

            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 2
        finally:
            await rag.finalize_storages()

    # Run on a private loop and close it afterwards so its resources are
    # released deterministically instead of being left to the collector.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    finally:
        loop.close()
@pytest.mark.offline
def test_parse_docling_options_signature_invalidates_cache(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Changing ``DOCLING_OCR_LANG`` between parses must trigger a re-download."""

    async def _run() -> None:
        monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
        counters = _install_fake_download(monkeypatch)

        input_dir = tmp_path / "inputs" / "ws"
        input_dir.mkdir(parents=True)
        src = input_dir / "demo.pdf"
        src.write_bytes(b"PDFPDF" * 256)

        rag = _new_rag(tmp_path)
        await rag.initialize_storages()
        try:
            _stub_pipeline(monkeypatch, rag, src)
            doc_id = "doc-abcdef0123456789abcdef0123456789"
            await _seed_doc_status(rag, doc_id)

            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 1

            # Flip an env var that participates in the options signature
            monkeypatch.setenv("DOCLING_OCR_LANG", "en,zh")
            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert (
                counters["calls"] == 2
            ), "DOCLING_OCR_LANG change must invalidate the bundle cache"
        finally:
            await rag.finalize_storages()

    # Run on a private loop and close it afterwards so its resources are
    # released deterministically instead of being left to the collector.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    finally:
        loop.close()
@pytest.mark.offline
def test_parse_docling_endpoint_signature_invalidates_cache(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Changing ``DOCLING_ENDPOINT`` between parses must trigger a re-download."""

    async def _run() -> None:
        monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
        counters = _install_fake_download(monkeypatch)

        input_dir = tmp_path / "inputs" / "ws"
        input_dir.mkdir(parents=True)
        src = input_dir / "demo.pdf"
        src.write_bytes(b"PDFPDF" * 256)

        rag = _new_rag(tmp_path)
        await rag.initialize_storages()
        try:
            _stub_pipeline(monkeypatch, rag, src)
            doc_id = "doc-abcdef0123456789abcdef0123456789"
            await _seed_doc_status(rag, doc_id)

            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert counters["calls"] == 1

            # Pointing at a different docling-serve instance must not silently
            # reuse a bundle that was produced by the previous one.
            monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling-other.test")
            await rag.parse_docling(
                doc_id=doc_id,
                file_path="demo.pdf",
                content_data={},
            )
            assert (
                counters["calls"] == 2
            ), "DOCLING_ENDPOINT change must invalidate the bundle cache"
        finally:
            await rag.finalize_storages()

    # Run on a private loop and close it afterwards so its resources are
    # released deterministically instead of being left to the collector.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    finally:
        loop.close()
@pytest.mark.offline
def test_parse_docling_zero_blocks_raises(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """When the docling bundle yields no body blocks (e.g. everything was
    classified as furniture/background) ``parse_docling`` must fail loudly
    so the document is marked failed — never persist a half-baked sidecar.
    """

    async def _run() -> None:
        monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")

        # Install a fake download that writes a valid bundle whose body has
        # no children — the adapter then produces zero IR blocks.
        import lightrag.parser.external.docling.client as client_mod

        empty_json: dict[str, Any] = {
            "schema_name": "DoclingDocument",
            "version": "1.10.0",
            "origin": {"filename": "demo.pdf", "mimetype": "application/pdf"},
            "body": {
                "self_ref": "#/body",
                "children": [],
                "content_layer": "body",
                "label": "unspecified",
            },
            "groups": [],
            "texts": [],
            "tables": [],
            "pictures": [],
            "key_value_items": [],
            "form_items": [],
            "pages": {},
        }

        async def _fake_download(
            self, raw_dir: Path, source_file_path: Path, **_kwargs
        ):
            raw_dir.mkdir(parents=True, exist_ok=True)
            main_json = raw_dir / "demo.json"
            main_json.write_text(json.dumps(empty_json), encoding="utf-8")
            (raw_dir / "demo.md").write_text("# empty", encoding="utf-8")

            src_size, src_hash = compute_size_and_hash(source_file_path)
            crit_size, crit_hash = compute_size_and_hash(main_json)
            others = [
                ManifestFile(
                    path="demo.md", size=(raw_dir / "demo.md").stat().st_size
                ),
            ]
            options_signature = compute_options_signature(
                tunable_env=snapshot_tunable_env(),
                fixed_constants=FIXED_CONSTANTS,
            )
            manifest = Manifest(
                engine="docling",
                source_content_hash=src_hash,
                source_size_bytes=src_size,
                source_filename_at_parse=source_file_path.name,
                critical_file=ManifestFile(
                    path="demo.json", size=crit_size, sha256=crit_hash
                ),
                files=others,
                total_size_bytes=crit_size + sum(f.size for f in others),
                task_id="fake-empty",
                endpoint_signature="http://docling.test",
                options_signature=options_signature,
                extras={"fixed_constants": dict(FIXED_CONSTANTS)},
                downloaded_at=datetime.now(timezone.utc).isoformat(
                    timespec="seconds"
                ),
            )
            write_manifest(raw_dir, manifest)
            return manifest

        monkeypatch.setattr(
            client_mod.DoclingRawClient, "download_into", _fake_download
        )

        input_dir = tmp_path / "inputs" / "ws"
        input_dir.mkdir(parents=True)
        src = input_dir / "demo.pdf"
        src.write_bytes(b"PDFPDF" * 256)

        rag = _new_rag(tmp_path)
        await rag.initialize_storages()
        try:
            _stub_pipeline(monkeypatch, rag, src)
            doc_id = "doc-abcdef0123456789abcdef0123456789"
            await _seed_doc_status(rag, doc_id)

            with pytest.raises(ValueError, match="zero blocks"):
                await rag.parse_docling(
                    doc_id=doc_id,
                    file_path="demo.pdf",
                    content_data={},
                )

            # Sidecar must NOT have been emitted: ``write_sidecar`` is reached
            # only after the zero-blocks check, so no ``*.blocks.jsonl`` may
            # exist anywhere under the workspace.
            blocks_files = list(tmp_path.rglob("*.blocks.jsonl"))
            assert (
                not blocks_files
            ), f"sidecar emitted despite zero-blocks failure: {blocks_files}"
        finally:
            await rag.finalize_storages()

    # Run on a private loop and close it afterwards so its resources are
    # released deterministically instead of being left to the collector.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(_run())
    finally:
        loop.close()
|