test_common.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
"""Tests for shared helpers in ``lightrag/parser/external/``.

These cover the pure functions reused across engine integrations:

- ``compute_size_and_hash`` — single-read (size, hash) pair
- ``clear_dir_contents`` — empty a directory while keeping it
- ``raw_dir_for_parsed_dir`` — suffix-bound raw dir naming
- ``safe_extract_zip`` — refuses path traversal and absolute paths
- ``env_bool`` / ``env_int`` / ``env_json`` — env parsing
- ``Manifest`` round-trip via ``write_manifest`` / ``load_manifest``
"""
  10. from __future__ import annotations
  11. import io
  12. import json
  13. import zipfile
  14. from pathlib import Path
  15. import pytest
  16. from lightrag.parser.external import (
  17. Manifest,
  18. ManifestFile,
  19. clear_dir_contents,
  20. compute_size_and_hash,
  21. env_bool,
  22. env_int,
  23. env_json,
  24. load_manifest,
  25. raw_dir_for_parsed_dir,
  26. safe_extract_zip,
  27. write_manifest,
  28. )
# ---------------------------------------------------------------------------
# compute_size_and_hash
# ---------------------------------------------------------------------------
  32. def test_compute_size_and_hash_stable(tmp_path: Path) -> None:
  33. p = tmp_path / "f.bin"
  34. payload = b"hello-external-parser" * 1024
  35. p.write_bytes(payload)
  36. size_a, hash_a = compute_size_and_hash(p)
  37. size_b, hash_b = compute_size_and_hash(p)
  38. assert size_a == len(payload) == size_b
  39. assert hash_a == hash_b
  40. assert hash_a.startswith("sha256:") and len(hash_a) == len("sha256:") + 64
# ---------------------------------------------------------------------------
# clear_dir_contents
# ---------------------------------------------------------------------------
  44. def test_clear_dir_contents_keeps_dir_and_removes_children(tmp_path: Path) -> None:
  45. d = tmp_path / "raw"
  46. d.mkdir()
  47. (d / "a.txt").write_text("hi")
  48. sub = d / "nested"
  49. sub.mkdir()
  50. (sub / "b.bin").write_bytes(b"x" * 10)
  51. clear_dir_contents(d)
  52. assert d.is_dir()
  53. assert list(d.iterdir()) == []
  54. def test_clear_dir_contents_noop_when_missing(tmp_path: Path) -> None:
  55. clear_dir_contents(tmp_path / "does-not-exist")
# ---------------------------------------------------------------------------
# raw_dir_for_parsed_dir
# ---------------------------------------------------------------------------
  59. def test_raw_dir_for_parsed_dir_with_suffix(tmp_path: Path) -> None:
  60. parsed = tmp_path / "demo.pdf.parsed"
  61. raw = raw_dir_for_parsed_dir(parsed, suffix=".docling_raw")
  62. assert raw == tmp_path / "demo.pdf.docling_raw"
  63. def test_raw_dir_for_parsed_dir_without_parsed_suffix(tmp_path: Path) -> None:
  64. parsed = tmp_path / "other_dir"
  65. raw = raw_dir_for_parsed_dir(parsed, suffix=".docling_raw")
  66. assert raw == tmp_path / "other_dir.docling_raw"
  67. def test_raw_dir_for_parsed_dir_rejects_bad_suffix(tmp_path: Path) -> None:
  68. with pytest.raises(ValueError):
  69. raw_dir_for_parsed_dir(tmp_path / "x.parsed", suffix="docling_raw")
# ---------------------------------------------------------------------------
# safe_extract_zip
# ---------------------------------------------------------------------------
  73. def _make_zip(entries: dict[str, bytes]) -> bytes:
  74. buf = io.BytesIO()
  75. with zipfile.ZipFile(buf, "w") as zf:
  76. for name, payload in entries.items():
  77. zf.writestr(name, payload)
  78. return buf.getvalue()
  79. def test_safe_extract_zip_extracts_flat_bundle(tmp_path: Path) -> None:
  80. payload = _make_zip(
  81. {
  82. "demo.json": b'{"schema_name": "DoclingDocument"}',
  83. "demo.md": b"# demo",
  84. "artifacts/image_000000.png": b"\x89PNG fake",
  85. }
  86. )
  87. dest = tmp_path / "raw"
  88. names = safe_extract_zip(payload, dest)
  89. assert (dest / "demo.json").read_bytes().startswith(b'{"schema_name"')
  90. assert (dest / "demo.md").read_text(encoding="utf-8") == "# demo"
  91. assert (dest / "artifacts" / "image_000000.png").is_file()
  92. assert sorted(names) == sorted(
  93. ["demo.json", "demo.md", "artifacts/image_000000.png"]
  94. )
  95. def test_safe_extract_zip_rejects_path_traversal(tmp_path: Path) -> None:
  96. payload = _make_zip({"../evil.txt": b"oops"})
  97. with pytest.raises(RuntimeError, match="unsafe path"):
  98. safe_extract_zip(payload, tmp_path / "raw")
  99. def test_safe_extract_zip_rejects_absolute_path(tmp_path: Path) -> None:
  100. payload = _make_zip({"/etc/passwd": b"oops"})
  101. with pytest.raises(RuntimeError, match="unsafe path"):
  102. safe_extract_zip(payload, tmp_path / "raw")
# ---------------------------------------------------------------------------
# env coercion
# ---------------------------------------------------------------------------
  106. def test_env_bool_truthy_falsy(monkeypatch: pytest.MonkeyPatch) -> None:
  107. for raw in ("1", "true", "yes", "ON"):
  108. monkeypatch.setenv("X", raw)
  109. assert env_bool("X", False) is True
  110. for raw in ("0", "false", "no", "off"):
  111. monkeypatch.setenv("X", raw)
  112. assert env_bool("X", True) is False
  113. def test_env_bool_falls_back_on_unrecognized(monkeypatch: pytest.MonkeyPatch) -> None:
  114. monkeypatch.setenv("X", "maybe")
  115. assert env_bool("X", True) is True
  116. assert env_bool("X", False) is False
  117. def test_env_int_falls_back_on_garbage(monkeypatch: pytest.MonkeyPatch) -> None:
  118. monkeypatch.setenv("X", "not-an-int")
  119. assert env_int("X", 7) == 7
  120. def test_env_json_returns_default_on_garbage(monkeypatch: pytest.MonkeyPatch) -> None:
  121. monkeypatch.setenv("X", "{bad json")
  122. assert env_json("X", {"origin": "LEFTBOTTOM"}) == {"origin": "LEFTBOTTOM"}
  123. def test_env_json_parses_object(monkeypatch: pytest.MonkeyPatch) -> None:
  124. monkeypatch.setenv("X", '{"a": 1, "b": [2, 3]}')
  125. assert env_json("X", None) == {"a": 1, "b": [2, 3]}
# ---------------------------------------------------------------------------
# Manifest round-trip
# ---------------------------------------------------------------------------
  129. def test_manifest_round_trip(tmp_path: Path) -> None:
  130. raw = tmp_path / "demo.docling_raw"
  131. raw.mkdir()
  132. crit = ManifestFile(path="demo.json", size=42, sha256="sha256:" + "a" * 64)
  133. other = ManifestFile(path="demo.md", size=10)
  134. manifest = Manifest(
  135. engine="docling",
  136. source_content_hash="sha256:" + "b" * 64,
  137. source_size_bytes=100,
  138. source_filename_at_parse="demo.pdf",
  139. critical_file=crit,
  140. files=[other],
  141. total_size_bytes=52,
  142. task_id="task-xyz",
  143. endpoint_signature="http://l4ai:5001",
  144. engine_version="1.18.0",
  145. options_signature="sha256:" + "c" * 64,
  146. downloaded_at="2026-05-18T00:00:00Z",
  147. extras={"to_formats": ["json", "md"]},
  148. )
  149. write_manifest(raw, manifest)
  150. payload = json.loads((raw / "_manifest.json").read_text(encoding="utf-8"))
  151. assert payload["engine"] == "docling"
  152. assert payload["options_signature"] == "sha256:" + "c" * 64
  153. assert payload["extras"] == {"to_formats": ["json", "md"]}
  154. loaded = load_manifest(raw, expected_engine="docling")
  155. assert loaded is not None
  156. assert loaded.task_id == "task-xyz"
  157. assert loaded.critical_file.size == 42
  158. assert loaded.files[0].path == "demo.md"
  159. def test_manifest_load_rejects_wrong_engine(tmp_path: Path) -> None:
  160. raw = tmp_path / "demo.docling_raw"
  161. raw.mkdir()
  162. manifest = Manifest(
  163. engine="mineru",
  164. source_content_hash="sha256:" + "0" * 64,
  165. source_size_bytes=1,
  166. source_filename_at_parse="x",
  167. critical_file=ManifestFile(path="c", size=1, sha256="sha256:" + "1" * 64),
  168. files=[],
  169. total_size_bytes=1,
  170. )
  171. write_manifest(raw, manifest)
  172. assert load_manifest(raw, expected_engine="docling") is None
  173. def test_manifest_load_handles_missing_file(tmp_path: Path) -> None:
  174. assert load_manifest(tmp_path / "no-such-dir", expected_engine="docling") is None