# test_manifest.py
"""Tests for ``lightrag/parser/external/docling/manifest.py`` helpers.

Targets the contract guarantees that the rest of the docling flow relies on:
``select_main_json`` must find the bundle's main JSON even when ``_manifest.json``
sits alongside it, and the preferred-path lookup must take priority over the
fallback glob.
"""
  7. from __future__ import annotations
  8. from pathlib import Path
  9. import pytest
  10. from lightrag.parser.external.docling.manifest import select_main_json
  11. def _touch(path: Path, content: str = "{}") -> None:
  12. path.parent.mkdir(parents=True, exist_ok=True)
  13. path.write_text(content, encoding="utf-8")
  14. def test_select_main_json_preferred_path_hits(tmp_path: Path) -> None:
  15. # Manifest is present (the typical post-download state), but the preferred
  16. # ``<stem>.json`` exists, so the fallback glob is not consulted at all.
  17. _touch(tmp_path / "report.json")
  18. _touch(tmp_path / "_manifest.json", '{"engine":"docling"}')
  19. assert select_main_json(tmp_path, Path("report.pdf")) == tmp_path / "report.json"
  20. def test_select_main_json_fallback_ignores_manifest(tmp_path: Path) -> None:
  21. # Defensive: when the preferred path misses (e.g. docling-serve renamed
  22. # the stem for whatever reason), the fallback glob must NOT confuse
  23. # ``_manifest.json`` for a bundle JSON. Pre-fix this case raised
  24. # "multiple .json candidates".
  25. _touch(tmp_path / "report.json")
  26. _touch(tmp_path / "_manifest.json", '{"engine":"docling"}')
  27. assert select_main_json(tmp_path, Path("other.pdf")) == tmp_path / "report.json"
  28. def test_select_main_json_raises_when_only_manifest_present(tmp_path: Path) -> None:
  29. # If the bundle JSON is genuinely missing, the manifest alone is not a
  30. # valid substitute — the helper must still raise rather than silently
  31. # returning the manifest.
  32. _touch(tmp_path / "_manifest.json", '{"engine":"docling"}')
  33. with pytest.raises(RuntimeError, match="contains no .json file"):
  34. select_main_json(tmp_path, Path("report.pdf"))
  35. def test_select_main_json_raises_on_real_ambiguity(tmp_path: Path) -> None:
  36. # Two genuine bundle JSONs is still an error; the manifest filter must
  37. # not mask multi-candidate detection.
  38. _touch(tmp_path / "report.json")
  39. _touch(tmp_path / "extra.json")
  40. _touch(tmp_path / "_manifest.json", '{"engine":"docling"}')
  41. with pytest.raises(RuntimeError, match="multiple .json candidates"):
  42. select_main_json(tmp_path, Path("other.pdf"))