| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461 |
"""``*.mineru_raw/`` cache validation tests.

Covers every failure mode that triggers a re-download:

- missing / malformed manifest (including a manifest written for another engine)
- source file size mismatch (fast-path)
- source file content_hash mismatch
- parser options signature missing / mismatch
- engine version / API mode / endpoint env mismatch
- critical_file (content_list.json) missing, or size / sha256 mismatch
- any non-critical file missing or size mismatch
"""
- from __future__ import annotations
- import json
- from pathlib import Path
- import pytest
- from lightrag.parser.external.mineru import (
- Manifest,
- ManifestFile,
- clear_dir_contents,
- compute_size_and_hash,
- is_bundle_valid,
- raw_dir_for_parsed_dir,
- )
- from lightrag.parser.external.mineru.cache import current_mineru_options_signature
- from lightrag.parser.external.mineru.manifest import write_manifest
- # ---------------------------------------------------------------------------
- # Fixtures
- # ---------------------------------------------------------------------------
@pytest.fixture
def source_file(tmp_path: Path) -> Path:
    """Create a small stand-in source document and return its path.

    The file is ``src.pdf`` under ``tmp_path`` and holds ``b"Hello PDF"``
    repeated 100 times. Tests mutate it in place to simulate a source file
    that changed after the bundle was produced.
    """
    pdf_path = tmp_path / "src.pdf"
    pdf_path.write_bytes(b"Hello PDF" * 100)
    return pdf_path
@pytest.fixture
def fresh_bundle(tmp_path: Path, source_file: Path) -> tuple[Path, Manifest]:
    """Build a fully-valid bundle next to ``source_file``.

    The bundle directory ``src.mineru_raw`` contains the critical
    ``content_list.json``, two auxiliary images under ``images/``, and a
    manifest written via ``write_manifest``.

    Returns:
        ``(raw_dir, manifest)`` — the bundle directory and the manifest object
        that was persisted into it.
    """
    bundle_dir = tmp_path / "src.mineru_raw"
    bundle_dir.mkdir()

    # Critical file: the only entry recorded with a sha256 in this manifest.
    critical_path = bundle_dir / "content_list.json"
    critical_path.write_text('[{"type":"text","text":"hi"}]', encoding="utf-8")

    # Auxiliary files: recorded by relative path and on-disk size only.
    image_dir = bundle_dir / "images"
    image_dir.mkdir()
    aux_entries = []
    for image_name, repeat in (("img1.png", 50), ("img2.png", 60)):
        image_path = image_dir / image_name
        image_path.write_bytes(b"PNG" * repeat)
        aux_entries.append(
            ManifestFile(
                path=f"images/{image_name}",
                size=image_path.stat().st_size,
            )
        )

    src_size, src_hash = compute_size_and_hash(source_file)
    crit_size, crit_hash = compute_size_and_hash(critical_path)

    bundle_manifest = Manifest(
        source_content_hash=src_hash,
        source_size_bytes=src_size,
        source_filename_at_parse=source_file.name,
        critical_file=ManifestFile(
            path="content_list.json", size=crit_size, sha256=crit_hash
        ),
        files=aux_entries,
        total_size_bytes=crit_size + sum(entry.size for entry in aux_entries),
        task_id="task-1",
        api_mode="local",
        # Signature is taken from the environment at fixture-build time.
        options_signature=current_mineru_options_signature(),
    )
    write_manifest(bundle_dir, bundle_manifest)
    return bundle_dir, bundle_manifest
- # ---------------------------------------------------------------------------
- # Layout helpers
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_raw_dir_naming(tmp_path: Path) -> None:
    """The raw dir is a sibling of the parsed dir with a ``.mineru_raw`` suffix."""
    parsed_dir = tmp_path / "report.pdf.parsed"

    raw_dir = raw_dir_for_parsed_dir(parsed_dir)

    assert raw_dir.name == "report.pdf.mineru_raw"
    assert raw_dir.parent == parsed_dir.parent
- # ---------------------------------------------------------------------------
- # Validation: happy path + every individual failure mode
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_is_bundle_valid_happy_path(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """An untouched bundle validates against its untouched source file."""
    bundle_dir, _manifest = fresh_bundle
    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is True
@pytest.mark.offline
def test_invalid_when_manifest_missing(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Deleting ``_manifest.json`` makes the bundle invalid."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"
    manifest_path.unlink()

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_manifest_malformed(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """A manifest that is not parseable JSON makes the bundle invalid."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"
    manifest_path.write_text("not json")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_manifest_wrong_engine(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """A manifest whose ``engine`` field is ``"docling"`` is rejected."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    # Rewrite the on-disk manifest with a different engine name.
    document = json.loads(manifest_path.read_text())
    document["engine"] = "docling"
    manifest_path.write_text(json.dumps(document))

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_source_size_changes(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Growing the source file by one byte invalidates the bundle."""
    bundle_dir, _manifest = fresh_bundle

    # One extra trailing byte — size diverges from the manifest fast-path.
    grown = source_file.read_bytes() + b"x"
    source_file.write_bytes(grown)

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_source_hash_changes_but_size_same(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Same-size content change in the source file invalidates the bundle.

    The file is rewritten in place with its first byte inverted, so the size
    fast-path still matches and only the content hash can detect the change.
    """
    bundle_dir, _manifest = fresh_bundle
    original = source_file.read_bytes()

    # Invert the first byte in a mutable copy; length stays identical.
    scratch = bytearray(original)
    scratch[0] ^= 0xFF
    mutated = bytes(scratch)
    assert len(mutated) == len(original)

    source_file.write_bytes(mutated)
    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_engine_version_mismatch(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Manifest ``engine_version`` differing from the env value is a miss."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    # Record one version in the manifest...
    document = json.loads(manifest_path.read_text())
    document["engine_version"] = "magic-pdf 1.5.4"
    manifest_path.write_text(json.dumps(document))

    # ...and advertise a different one through the environment.
    monkeypatch.setenv("MINERU_ENGINE_VERSION", "magic-pdf 1.6.0")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_engine_version_match_passes(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Identical manifest and env ``engine_version`` keeps the bundle valid."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    document = json.loads(manifest_path.read_text())
    document["engine_version"] = "magic-pdf 1.5.4"
    manifest_path.write_text(json.dumps(document))

    # Same version string on the environment side.
    monkeypatch.setenv("MINERU_ENGINE_VERSION", "magic-pdf 1.5.4")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is True
@pytest.mark.offline
def test_engine_version_skip_when_either_side_blank(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The engine-version check is skipped when either side is blank.

    Blank manifest engine_version + non-blank env should NOT invalidate
    (no signal from manifest); same for the reverse. Both directions are
    exercised here so the "reverse" half of the contract is actually pinned.
    """
    raw, _ = fresh_bundle
    manifest_path = raw / "_manifest.json"

    # Direction 1: manifest engine_version is empty by default, env is set.
    monkeypatch.setenv("MINERU_ENGINE_VERSION", "anything")
    assert is_bundle_valid(raw, source_file) is True

    # Direction 2: manifest carries a version, env provides none.
    payload = json.loads(manifest_path.read_text())
    payload["engine_version"] = "magic-pdf 1.5.4"
    manifest_path.write_text(json.dumps(payload))
    monkeypatch.delenv("MINERU_ENGINE_VERSION", raising=False)
    assert is_bundle_valid(raw, source_file) is True
@pytest.mark.offline
def test_invalid_when_api_mode_mismatch(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A ``local`` manifest is rejected when the env selects ``official``."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    document = json.loads(manifest_path.read_text())
    document["api_mode"] = "local"
    manifest_path.write_text(json.dumps(document))

    # Switch the environment to official mode (a token is set alongside it).
    for env_name, env_value in (
        ("MINERU_API_MODE", "official"),
        ("MINERU_API_TOKEN", "token"),
    ):
        monkeypatch.setenv(env_name, env_value)

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_options_signature_missing(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """A manifest without an ``options_signature`` key is rejected."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    document = json.loads(manifest_path.read_text())
    # Drop the key if present; a missing key is tolerated by ``pop``.
    document.pop("options_signature", None)
    manifest_path.write_text(json.dumps(document))

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
@pytest.mark.parametrize(
    ("key", "value"),
    [
        ("MINERU_LOCAL_BACKEND", "pipeline"),
        ("MINERU_LOCAL_PARSE_METHOD", "ocr"),
        ("MINERU_LOCAL_IMAGE_ANALYSIS", "false"),
        ("MINERU_LOCAL_START_PAGE_ID", "1"),
    ],
)
def test_invalid_when_local_parser_options_change(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
    key: str,
    value: str,
) -> None:
    """Setting any one local-mode parser option env var causes a cache miss.

    The bundle's signature was captured by the fixture before these variables
    were set; each parametrized case then sets a single ``key`` to ``value``.
    """
    bundle_dir, _manifest = fresh_bundle

    monkeypatch.setenv("MINERU_API_MODE", "local")
    monkeypatch.setenv(key, value)

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
@pytest.mark.parametrize(
    ("key", "value"),
    [
        ("MINERU_MODEL_VERSION", "pipeline"),
        ("MINERU_IS_OCR", "true"),
        ("MINERU_PAGE_RANGES", "1-5"),
        ("MINERU_LANGUAGE", "en"),
        ("MINERU_ENABLE_TABLE", "false"),
        ("MINERU_ENABLE_FORMULA", "false"),
    ],
)
def test_invalid_when_official_parser_options_change(
    tmp_path: Path,
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
    key: str,
    value: str,
) -> None:
    """Official-mode counterpart of the local parser-options test.

    A bundle is built with ``MINERU_API_MODE=official`` already in effect, so
    its ``options_signature`` is computed under official mode. The bundle must
    validate as-is; after setting ``key`` to ``value`` it must not.
    """
    # Must be set before the signature is computed further down.
    monkeypatch.setenv("MINERU_API_MODE", "official")

    bundle_dir = tmp_path / "src.mineru_raw"
    bundle_dir.mkdir()
    critical_path = bundle_dir / "content_list.json"
    critical_path.write_text('[{"type":"text","text":"hi"}]', encoding="utf-8")

    crit_size, crit_hash = compute_size_and_hash(critical_path)
    src_size, src_hash = compute_size_and_hash(source_file)

    critical_entry = ManifestFile(
        path="content_list.json", size=crit_size, sha256=crit_hash
    )
    official_manifest = Manifest(
        source_content_hash=src_hash,
        source_size_bytes=src_size,
        source_filename_at_parse=source_file.name,
        critical_file=critical_entry,
        files=[],
        total_size_bytes=crit_size,
        task_id="task-official",
        api_mode="official",
        options_signature=current_mineru_options_signature(),
    )
    write_manifest(bundle_dir, official_manifest)

    # Sanity check: the bundle is valid before any option is flipped.
    baseline = is_bundle_valid(bundle_dir, source_file)
    assert baseline is True

    monkeypatch.setenv(key, value)
    after_change = is_bundle_valid(bundle_dir, source_file)
    assert after_change is False
@pytest.mark.offline
def test_invalid_when_endpoint_signature_mismatch(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A manifest endpoint differing from ``MINERU_LOCAL_ENDPOINT`` is a miss."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    document = json.loads(manifest_path.read_text())
    document.update(
        {"api_mode": "local", "endpoint_signature": "http://old.example"}
    )
    manifest_path.write_text(json.dumps(document))

    monkeypatch.setenv("MINERU_API_MODE", "local")
    monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://new.example")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_endpoint_signature_uses_mode_specific_endpoint(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """In local mode, ``MINERU_LOCAL_ENDPOINT`` is compared to the manifest.

    NOTE(review): the steps here are the same as in
    ``test_invalid_when_endpoint_signature_mismatch``; this test presumably
    was meant to also show that another mode's endpoint variable is ignored —
    confirm and extend, or drop one of the two.
    """
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    document = json.loads(manifest_path.read_text())
    document["api_mode"] = "local"
    document["endpoint_signature"] = "http://old.example"
    manifest_path.write_text(json.dumps(document))

    for env_name, env_value in (
        ("MINERU_API_MODE", "local"),
        ("MINERU_LOCAL_ENDPOINT", "http://new.example"),
    ):
        monkeypatch.setenv(env_name, env_value)

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_endpoint_signature_ignores_trailing_slash(
    fresh_bundle: tuple[Path, Manifest],
    source_file: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An env endpoint differing only by a trailing ``/`` still matches."""
    bundle_dir, _manifest = fresh_bundle
    manifest_path = bundle_dir / "_manifest.json"

    document = json.loads(manifest_path.read_text())
    document["api_mode"] = "local"
    document["endpoint_signature"] = "http://old.example"
    manifest_path.write_text(json.dumps(document))

    monkeypatch.setenv("MINERU_API_MODE", "local")
    # Same endpoint as the manifest, plus a trailing slash.
    monkeypatch.setenv("MINERU_LOCAL_ENDPOINT", "http://old.example/")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is True
@pytest.mark.offline
def test_invalid_when_critical_file_missing(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Removing ``content_list.json`` from the bundle invalidates it."""
    bundle_dir, _manifest = fresh_bundle
    critical_path = bundle_dir / "content_list.json"
    critical_path.unlink()

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_critical_file_size_changes(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Appending text to ``content_list.json`` invalidates the bundle."""
    bundle_dir, _manifest = fresh_bundle
    critical_path = bundle_dir / "content_list.json"

    previous_text = critical_path.read_text()
    critical_path.write_text(previous_text + "/* extra */")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_critical_file_hash_changes(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Same-size edit of ``content_list.json`` invalidates the bundle.

    The size is unchanged, so only the sha256 comparison can reject it.
    """
    bundle_dir, _manifest = fresh_bundle
    critical_path = bundle_dir / "content_list.json"

    original_text = critical_path.read_text()
    # Replace only the final character so the length is preserved.
    mutated_text = f"{original_text[:-1]}X"
    assert len(mutated_text) == len(original_text)

    critical_path.write_text(mutated_text)
    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_aux_file_size_changes(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Growing an auxiliary image file invalidates the bundle."""
    bundle_dir, _manifest = fresh_bundle
    image_path = bundle_dir / "images" / "img1.png"

    # Append extra bytes so the on-disk size no longer matches the manifest.
    with image_path.open("ab") as handle:
        handle.write(b"corruption")

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
@pytest.mark.offline
def test_invalid_when_aux_file_missing(
    fresh_bundle: tuple[Path, Manifest], source_file: Path
) -> None:
    """Deleting an auxiliary image listed in the manifest invalidates it."""
    bundle_dir, _manifest = fresh_bundle
    image_path = bundle_dir / "images" / "img2.png"
    image_path.unlink()

    outcome = is_bundle_valid(bundle_dir, source_file)
    assert outcome is False
- # ---------------------------------------------------------------------------
- # Helpers
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_clear_dir_contents_preserves_directory(tmp_path: Path) -> None:
    """``clear_dir_contents`` empties a directory but leaves it in place."""
    target = tmp_path / "raw"
    nested = target / "sub"
    target.mkdir()
    (target / "a.txt").write_text("a")
    nested.mkdir()
    (nested / "b.txt").write_text("b")

    clear_dir_contents(target)

    assert target.exists()
    leftovers = list(target.iterdir())
    assert leftovers == []
@pytest.mark.offline
def test_compute_size_and_hash_consistency(tmp_path: Path) -> None:
    """Both values describe the same byte stream.

    Checks the reported size, the ``sha256:<64 chars>`` shape of the hash,
    that hashing the same file twice gives the same answer, and that a
    same-size file with different bytes gets a different hash.
    """
    p = tmp_path / "f.bin"
    payload = b"abc" * 1000
    p.write_bytes(payload)

    size, h = compute_size_and_hash(p)
    assert size == len(payload)
    # Separate asserts so a failure points at the exact property violated.
    assert h.startswith("sha256:")
    assert len(h) == len("sha256:") + 64

    # Deterministic: a second pass over the unchanged file agrees.
    size_again, h_again = compute_size_and_hash(p)
    assert size_again == size
    assert h_again == h

    # Content-sensitive: identical length, different bytes, different hash.
    other = tmp_path / "g.bin"
    other.write_bytes(b"abd" * 1000)
    other_size, other_hash = compute_size_and_hash(other)
    assert other_size == size
    assert other_hash != h
@pytest.mark.offline
def test_manifest_round_trip_via_disk(tmp_path: Path) -> None:
    """Write → read recovers all fields.

    Every field passed to the ``Manifest`` constructor below is asserted on
    the loaded object, so a field dropped during serialization is caught.
    """
    # Local import: only this test needs the loader.
    from lightrag.parser.external.mineru.manifest import load_manifest

    raw = tmp_path / "rt.mineru_raw"
    raw.mkdir()
    m = Manifest(
        source_content_hash="sha256:abc",
        source_size_bytes=10,
        source_filename_at_parse="x.pdf",
        critical_file=ManifestFile(
            path="content_list.json", size=5, sha256="sha256:cl"
        ),
        files=[ManifestFile(path="images/i.png", size=3)],
        total_size_bytes=8,
        task_id="t1",
        engine_version="v",
        endpoint_signature="ep",
        options_signature="sha256:opts",
    )
    write_manifest(raw, m)

    loaded = load_manifest(raw)
    assert loaded is not None

    # Source-file description.
    assert loaded.source_content_hash == "sha256:abc"
    assert loaded.source_size_bytes == 10
    assert loaded.source_filename_at_parse == "x.pdf"

    # Critical file entry.
    assert loaded.critical_file.path == "content_list.json"
    assert loaded.critical_file.size == 5
    assert loaded.critical_file.sha256 == "sha256:cl"

    # Auxiliary files and totals.
    assert [(f.path, f.size) for f in loaded.files] == [("images/i.png", 3)]
    assert loaded.total_size_bytes == 8

    # Provenance / environment fields.
    assert loaded.task_id == "t1"
    assert loaded.engine_version == "v"
    assert loaded.endpoint_signature == "ep"
    assert loaded.options_signature == "sha256:opts"
|