| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208 |
- """Tests for ``lightrag/parser/external/docling/cache.py``.
- Covers the cache-miss conditions enumerated in that module's docstring:
- - missing / malformed / wrong-engine manifest
- - source size or hash mismatch
- - engine_version / endpoint_signature env mismatch
- - options_signature env mismatch
- - critical-file size / sha256 mismatch
- - non-critical file size mismatch
- """
- from __future__ import annotations
- from pathlib import Path
- import pytest
- from lightrag.parser.external import Manifest, ManifestFile, write_manifest
- from lightrag.parser.external._common import compute_size_and_hash
- from lightrag.parser.external.docling.cache import (
- compute_options_signature,
- is_bundle_valid,
- snapshot_tunable_env,
- )
- from lightrag.parser.external.docling.client import FIXED_CONSTANTS
@pytest.fixture(autouse=True)
def _clear_envs(monkeypatch: pytest.MonkeyPatch) -> None:
    """Reset the Docling environment variables before every test.

    Each listed variable is removed if present, then ``DOCLING_ENDPOINT``
    is set to ``http://docling.test``. The fixture is autouse, so it runs
    for every test in this module without being requested.
    """
    docling_vars = (
        "DOCLING_DO_OCR",
        "DOCLING_FORCE_OCR",
        "DOCLING_OCR_ENGINE",
        "DOCLING_OCR_PRESET",
        "DOCLING_OCR_LANG",
        "DOCLING_DO_FORMULA_ENRICHMENT",
        "DOCLING_ENGINE_VERSION",
        "DOCLING_ENDPOINT",
    )
    for var in docling_vars:
        # raising=False: a variable that is already absent is not an error.
        monkeypatch.delenv(var, raising=False)
    # Set after the deletions so the endpoint is always defined.
    monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
@pytest.fixture
def source_file(tmp_path: Path) -> Path:
    """Create ``src.pdf`` under ``tmp_path`` and return its path.

    The file holds ``b"hello pdf payload"`` repeated 64 times.
    """
    payload = b"hello pdf payload" * 64
    pdf_path = tmp_path / "src.pdf"
    pdf_path.write_bytes(payload)
    return pdf_path
def test_snapshot_tunable_env_uses_effective_defaults(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Explicitly set values must yield the same snapshot as unset variables."""
    baseline = snapshot_tunable_env()

    # Values expected to coincide with what the snapshot reports when the
    # variables are not set at all.
    explicit_values = {
        "DOCLING_DO_OCR": "true",
        "DOCLING_FORCE_OCR": "true",
        "DOCLING_OCR_ENGINE": "auto",
        "DOCLING_OCR_PRESET": "auto",
        "DOCLING_OCR_LANG": "",
        "DOCLING_DO_FORMULA_ENRICHMENT": "false",
    }
    for name, value in explicit_values.items():
        monkeypatch.setenv(name, value)

    assert snapshot_tunable_env() == baseline
def _build_valid_bundle(
    tmp_path: Path,
    source_file: Path,
    *,
    options_signature: str | None = None,
) -> Path:
    """Create a Docling raw bundle with a manifest and return its directory.

    Writes ``src.json`` and ``src.md`` into ``tmp_path / "src.docling_raw"``,
    then writes a manifest describing those files and ``source_file``.

    Args:
        tmp_path: Directory in which the bundle directory is created.
        source_file: Source document whose size and hash are recorded.
        options_signature: Signature to record in the manifest. When
            ``None``, it is computed from the current tunable env snapshot
            and ``FIXED_CONSTANTS``.

    Returns:
        Path of the created bundle directory.
    """
    bundle_dir = tmp_path / "src.docling_raw"
    bundle_dir.mkdir()

    critical_path = bundle_dir / "src.json"
    critical_path.write_text('{"schema_name": "DoclingDocument"}', encoding="utf-8")
    markdown_path = bundle_dir / "src.md"
    markdown_path.write_text("# title", encoding="utf-8")

    source_size, source_hash = compute_size_and_hash(source_file)
    critical_size, critical_hash = compute_size_and_hash(critical_path)
    markdown_size = markdown_path.stat().st_size

    signature = (
        options_signature
        if options_signature is not None
        else compute_options_signature(
            tunable_env=snapshot_tunable_env(),
            fixed_constants=FIXED_CONSTANTS,
        )
    )

    critical_entry = ManifestFile(
        path="src.json", size=critical_size, sha256=critical_hash
    )
    # The non-critical file is recorded with its size only (no sha256).
    other_entries = [ManifestFile(path="src.md", size=markdown_size)]

    write_manifest(
        bundle_dir,
        Manifest(
            engine="docling",
            source_content_hash=source_hash,
            source_size_bytes=source_size,
            source_filename_at_parse=source_file.name,
            critical_file=critical_entry,
            files=other_entries,
            total_size_bytes=critical_size + markdown_size,
            task_id="task-1",
            endpoint_signature="http://docling.test",
            engine_version="",
            options_signature=signature,
            extras={"fixed_constants": dict(FIXED_CONSTANTS)},
        ),
    )
    return bundle_dir
def test_is_bundle_valid_happy_path(tmp_path: Path, source_file: Path) -> None:
    """A freshly built bundle checked against its untouched source is valid."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)
    verdict = is_bundle_valid(bundle_dir, source_file)
    assert verdict is True
def test_is_bundle_valid_missing_dir(tmp_path: Path, source_file: Path) -> None:
    """A bundle directory that does not exist is reported as invalid."""
    nonexistent_dir = tmp_path / "ghost"
    verdict = is_bundle_valid(nonexistent_dir, source_file)
    assert verdict is False
def test_is_bundle_valid_missing_manifest(tmp_path: Path, source_file: Path) -> None:
    """A bundle directory holding ``src.json`` but no manifest is invalid."""
    raw = tmp_path / "src.docling_raw"
    raw.mkdir()
    # Explicit encoding, matching the other text writes in this module, so
    # the bytes on disk do not depend on the platform's locale encoding.
    (raw / "src.json").write_text("{}", encoding="utf-8")
    assert is_bundle_valid(raw, source_file) is False
def test_is_bundle_valid_wrong_engine(tmp_path: Path, source_file: Path) -> None:
    """A manifest whose ``"docling"`` value is rewritten to ``"mineru"`` is invalid."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)
    manifest_file = bundle_dir / "_manifest.json"

    # Textual substitution on the serialized manifest: every quoted
    # "docling" token becomes "mineru".
    original_text = manifest_file.read_text(encoding="utf-8")
    tampered_text = original_text.replace('"docling"', '"mineru"')
    manifest_file.write_text(tampered_text, encoding="utf-8")

    assert is_bundle_valid(bundle_dir, source_file) is False
def test_is_bundle_valid_source_size_mismatch(
    tmp_path: Path, source_file: Path
) -> None:
    """Growing the source by one byte after the build invalidates the bundle."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)

    # Append a single byte so the source no longer has the recorded size.
    grown_payload = source_file.read_bytes() + b"!"
    source_file.write_bytes(grown_payload)

    assert is_bundle_valid(bundle_dir, source_file) is False
def test_is_bundle_valid_source_hash_mismatch(
    tmp_path: Path, source_file: Path
) -> None:
    """Same-length but different source content invalidates the bundle."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)

    # Keep the byte count identical and change only the content, so a
    # size comparison alone cannot tell the files apart.
    original_size = source_file.stat().st_size
    source_file.write_bytes(b"Y" * original_size)

    assert is_bundle_valid(bundle_dir, source_file) is False
def test_is_bundle_valid_endpoint_change(
    tmp_path: Path, source_file: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Changing ``DOCLING_ENDPOINT`` after the build invalidates the bundle."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)

    # The manifest was written with "http://docling.test" as its
    # endpoint_signature; point the environment somewhere else.
    monkeypatch.setenv("DOCLING_ENDPOINT", "http://other:5001")

    verdict = is_bundle_valid(bundle_dir, source_file)
    assert verdict is False
def test_is_bundle_valid_options_signature_change(
    tmp_path: Path, source_file: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Setting ``DOCLING_FORCE_OCR`` to ``"false"`` after the build invalidates the bundle."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)

    # The bundle's signature was computed with this variable unset.
    monkeypatch.setenv("DOCLING_FORCE_OCR", "false")

    verdict = is_bundle_valid(bundle_dir, source_file)
    assert verdict is False
def test_is_bundle_valid_fixed_constants_code_change(
    tmp_path: Path, source_file: Path
) -> None:
    """A bundle recorded under different fixed constants must be rejected.

    Simulates a code-only change to one of the fixed pipeline constants
    (e.g. ``image_export_mode`` flipped from "referenced" to "embedded"
    between parse time and validation time). The manifest stores both the
    stale constants and a signature computed from them; validation must
    compare against the current ``FIXED_CONSTANTS`` and miss, not against
    the manifest's own copy (which would always match).
    """
    import json

    stale_constants = {**FIXED_CONSTANTS, "image_export_mode": "embedded"}
    # Precondition: the stale value must differ from the current constants.
    # Without this guard, a future change of the real constant to
    # "embedded" would surface as a confusing failure of the final
    # assertion instead of pointing at the test setup.
    assert stale_constants != dict(FIXED_CONSTANTS), (
        "stale_constants equals FIXED_CONSTANTS; choose a different stale value"
    )

    stale_signature = compute_options_signature(
        tunable_env=snapshot_tunable_env(),
        fixed_constants=stale_constants,
    )
    raw = _build_valid_bundle(tmp_path, source_file, options_signature=stale_signature)

    # Overwrite the manifest's extras to record the stale constants too —
    # this is the bug surface: if validation rehydrated from extras, it
    # would reproduce stale_signature and falsely accept the bundle.
    manifest_path = raw / "_manifest.json"
    data = json.loads(manifest_path.read_text(encoding="utf-8"))
    data["extras"] = {"fixed_constants": stale_constants}
    manifest_path.write_text(json.dumps(data), encoding="utf-8")

    assert is_bundle_valid(raw, source_file) is False
def test_is_bundle_valid_critical_file_corrupt(
    tmp_path: Path, source_file: Path
) -> None:
    """Same-length corruption of ``src.json`` invalidates the bundle."""
    bundle_dir = _build_valid_bundle(tmp_path, source_file)
    critical_path = bundle_dir / "src.json"

    # Overwrite with the same number of bytes but different content: the
    # size still matches, so only the sha256 comparison can detect it.
    byte_count = len(critical_path.read_bytes())
    critical_path.write_bytes(b"X" * byte_count)

    assert is_bundle_valid(bundle_dir, source_file) is False
def test_is_bundle_valid_other_file_size_mismatch(
    tmp_path: Path, source_file: Path
) -> None:
    """Changing the size of the non-critical ``src.md`` invalidates the bundle."""
    raw = _build_valid_bundle(tmp_path, source_file)
    # Explicit encoding, matching the other text writes in this module, so
    # the resulting byte size does not depend on the platform's locale
    # encoding. The new text is longer than the "# title" written at build.
    (raw / "src.md").write_text(
        "totally different content here that is longer", encoding="utf-8"
    )
    assert is_bundle_valid(raw, source_file) is False
|