test_cache.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. """Tests for ``lightrag/parser/external/docling/cache.py``.
  2. Covers the cache-miss conditions enumerated in the module docstring:
  3. - missing / malformed / wrong-engine manifest
  4. - source size or hash mismatch
  5. - engine_version / endpoint_signature env mismatch
  6. - options_signature env mismatch
  7. - critical-file size / sha256 mismatch
  8. - non-critical file size mismatch
  9. """
  10. from __future__ import annotations
  11. from pathlib import Path
  12. import pytest
  13. from lightrag.parser.external import Manifest, ManifestFile, write_manifest
  14. from lightrag.parser.external._common import compute_size_and_hash
  15. from lightrag.parser.external.docling.cache import (
  16. compute_options_signature,
  17. is_bundle_valid,
  18. snapshot_tunable_env,
  19. )
  20. from lightrag.parser.external.docling.client import FIXED_CONSTANTS
  21. @pytest.fixture(autouse=True)
  22. def _clear_envs(monkeypatch: pytest.MonkeyPatch) -> None:
  23. for name in (
  24. "DOCLING_DO_OCR",
  25. "DOCLING_FORCE_OCR",
  26. "DOCLING_OCR_ENGINE",
  27. "DOCLING_OCR_PRESET",
  28. "DOCLING_OCR_LANG",
  29. "DOCLING_DO_FORMULA_ENRICHMENT",
  30. "DOCLING_ENGINE_VERSION",
  31. "DOCLING_ENDPOINT",
  32. ):
  33. monkeypatch.delenv(name, raising=False)
  34. monkeypatch.setenv("DOCLING_ENDPOINT", "http://docling.test")
  35. @pytest.fixture
  36. def source_file(tmp_path: Path) -> Path:
  37. p = tmp_path / "src.pdf"
  38. p.write_bytes(b"hello pdf payload" * 64)
  39. return p
  40. def test_snapshot_tunable_env_uses_effective_defaults(
  41. monkeypatch: pytest.MonkeyPatch,
  42. ) -> None:
  43. unset = snapshot_tunable_env()
  44. monkeypatch.setenv("DOCLING_DO_OCR", "true")
  45. monkeypatch.setenv("DOCLING_FORCE_OCR", "true")
  46. monkeypatch.setenv("DOCLING_OCR_ENGINE", "auto")
  47. monkeypatch.setenv("DOCLING_OCR_PRESET", "auto")
  48. monkeypatch.setenv("DOCLING_OCR_LANG", "")
  49. monkeypatch.setenv("DOCLING_DO_FORMULA_ENRICHMENT", "false")
  50. assert snapshot_tunable_env() == unset
  51. def _build_valid_bundle(
  52. tmp_path: Path,
  53. source_file: Path,
  54. *,
  55. options_signature: str | None = None,
  56. ) -> Path:
  57. raw_dir = tmp_path / "src.docling_raw"
  58. raw_dir.mkdir()
  59. main_json = raw_dir / "src.json"
  60. main_json.write_text('{"schema_name": "DoclingDocument"}', encoding="utf-8")
  61. md = raw_dir / "src.md"
  62. md.write_text("# title", encoding="utf-8")
  63. src_size, src_hash = compute_size_and_hash(source_file)
  64. crit_size, crit_hash = compute_size_and_hash(main_json)
  65. sig = options_signature
  66. if sig is None:
  67. sig = compute_options_signature(
  68. tunable_env=snapshot_tunable_env(),
  69. fixed_constants=FIXED_CONSTANTS,
  70. )
  71. manifest = Manifest(
  72. engine="docling",
  73. source_content_hash=src_hash,
  74. source_size_bytes=src_size,
  75. source_filename_at_parse=source_file.name,
  76. critical_file=ManifestFile(path="src.json", size=crit_size, sha256=crit_hash),
  77. files=[ManifestFile(path="src.md", size=md.stat().st_size)],
  78. total_size_bytes=crit_size + md.stat().st_size,
  79. task_id="task-1",
  80. endpoint_signature="http://docling.test",
  81. engine_version="",
  82. options_signature=sig,
  83. extras={"fixed_constants": dict(FIXED_CONSTANTS)},
  84. )
  85. write_manifest(raw_dir, manifest)
  86. return raw_dir
  87. def test_is_bundle_valid_happy_path(tmp_path: Path, source_file: Path) -> None:
  88. raw = _build_valid_bundle(tmp_path, source_file)
  89. assert is_bundle_valid(raw, source_file) is True
  90. def test_is_bundle_valid_missing_dir(tmp_path: Path, source_file: Path) -> None:
  91. assert is_bundle_valid(tmp_path / "ghost", source_file) is False
  92. def test_is_bundle_valid_missing_manifest(tmp_path: Path, source_file: Path) -> None:
  93. raw = tmp_path / "src.docling_raw"
  94. raw.mkdir()
  95. (raw / "src.json").write_text("{}")
  96. assert is_bundle_valid(raw, source_file) is False
  97. def test_is_bundle_valid_wrong_engine(tmp_path: Path, source_file: Path) -> None:
  98. raw = _build_valid_bundle(tmp_path, source_file)
  99. manifest_path = raw / "_manifest.json"
  100. data = manifest_path.read_text(encoding="utf-8")
  101. manifest_path.write_text(data.replace('"docling"', '"mineru"'), encoding="utf-8")
  102. assert is_bundle_valid(raw, source_file) is False
  103. def test_is_bundle_valid_source_size_mismatch(
  104. tmp_path: Path, source_file: Path
  105. ) -> None:
  106. raw = _build_valid_bundle(tmp_path, source_file)
  107. source_file.write_bytes(source_file.read_bytes() + b"!")
  108. assert is_bundle_valid(raw, source_file) is False
  109. def test_is_bundle_valid_source_hash_mismatch(
  110. tmp_path: Path, source_file: Path
  111. ) -> None:
  112. raw = _build_valid_bundle(tmp_path, source_file)
  113. # Replace contents with same length but different bytes
  114. new = b"Y" * source_file.stat().st_size
  115. source_file.write_bytes(new)
  116. assert is_bundle_valid(raw, source_file) is False
  117. def test_is_bundle_valid_endpoint_change(
  118. tmp_path: Path, source_file: Path, monkeypatch: pytest.MonkeyPatch
  119. ) -> None:
  120. raw = _build_valid_bundle(tmp_path, source_file)
  121. monkeypatch.setenv("DOCLING_ENDPOINT", "http://other:5001")
  122. assert is_bundle_valid(raw, source_file) is False
  123. def test_is_bundle_valid_options_signature_change(
  124. tmp_path: Path, source_file: Path, monkeypatch: pytest.MonkeyPatch
  125. ) -> None:
  126. raw = _build_valid_bundle(tmp_path, source_file)
  127. monkeypatch.setenv("DOCLING_FORCE_OCR", "false")
  128. assert is_bundle_valid(raw, source_file) is False
  129. def test_is_bundle_valid_fixed_constants_code_change(
  130. tmp_path: Path, source_file: Path
  131. ) -> None:
  132. # Simulate a code-only change to one of the fixed pipeline constants
  133. # (e.g. image_export_mode flipped from "referenced" to "embedded"
  134. # between parse time and validation time). The manifest stores both
  135. # the stale constants and a signature computed from them; validation
  136. # must compare against current FIXED_CONSTANTS and miss, not against
  137. # the manifest's own copy (which would always match).
  138. stale_constants = {**FIXED_CONSTANTS, "image_export_mode": "embedded"}
  139. stale_signature = compute_options_signature(
  140. tunable_env=snapshot_tunable_env(),
  141. fixed_constants=stale_constants,
  142. )
  143. raw = _build_valid_bundle(tmp_path, source_file, options_signature=stale_signature)
  144. # Overwrite the manifest's extras to record the stale constants too —
  145. # this is the bug surface: if validation rehydrated from extras, it
  146. # would reproduce stale_signature and falsely accept the bundle.
  147. import json as _json
  148. mp = raw / "_manifest.json"
  149. data = _json.loads(mp.read_text(encoding="utf-8"))
  150. data["extras"] = {"fixed_constants": stale_constants}
  151. mp.write_text(_json.dumps(data), encoding="utf-8")
  152. assert is_bundle_valid(raw, source_file) is False
  153. def test_is_bundle_valid_critical_file_corrupt(
  154. tmp_path: Path, source_file: Path
  155. ) -> None:
  156. raw = _build_valid_bundle(tmp_path, source_file)
  157. # Corrupt the JSON: same length, different bytes — defeats size check,
  158. # so the sha256 path must catch it.
  159. current = (raw / "src.json").read_bytes()
  160. (raw / "src.json").write_bytes(b"X" * len(current))
  161. assert is_bundle_valid(raw, source_file) is False
  162. def test_is_bundle_valid_other_file_size_mismatch(
  163. tmp_path: Path, source_file: Path
  164. ) -> None:
  165. raw = _build_valid_bundle(tmp_path, source_file)
  166. (raw / "src.md").write_text("totally different content here that is longer")
  167. assert is_bundle_valid(raw, source_file) is False