"""Byte-equivalence golden tests for the native docx → SidecarWriter migration. These tests run the production code path (``LightRAG.parse_native``) on each scenario in ``_native_docx_fixtures.SCENARIOS`` and assert that every produced file matches the captured baseline bytes under ``tests/parser/docx/golden/native_docx//``. The baseline was generated by ``scripts/regen_native_docx_golden.py``; the script lives in-tree so the snapshots can be regenerated when intentional changes to the format are made (e.g. spec evolution). """ from __future__ import annotations import asyncio import sys from pathlib import Path from unittest import mock import pytest # Make the sibling _native_docx_fixtures module importable as if it were a # top-level helper. Avoids needing a package __init__ rename. HERE = Path(__file__).resolve().parent if str(HERE) not in sys.path: sys.path.insert(0, str(HERE)) from _native_docx_fixtures import SCENARIOS, Scenario # noqa: E402 from lightrag.parser.debug import ( # noqa: E402 FrozenDateTime, build_debug_rag, ) GOLDEN_ROOT = HERE / "golden" / "native_docx" def _run_new_path( scenario: Scenario, input_dir: Path, monkeypatch: pytest.MonkeyPatch ) -> Path: """Invoke ``LightRAG.parse_native`` on a single scenario. The upstream ``extract_docx_blocks`` is stubbed to feed the synthetic blocks and produce the matching asset files inside ``.blocks.assets/``. ``datetime.now`` is frozen so ``parse_time`` matches the captured baseline exactly. Returns the parsed directory containing the produced artifacts. """ from lightrag.constants import ( FULL_DOCS_FORMAT_PENDING_PARSE, PARSED_DIR_NAME, ) monkeypatch.setenv("INPUT_DIR", str(input_dir)) # parse_native archives the source after parsing; for the golden test # we don't want the docx moving around between scenarios. async def _noop_archive(_p: str) -> None: return None import lightrag.pipeline as pipeline_module monkeypatch.setattr( pipeline_module, "archive_docx_source_after_full_docs_sync", _noop_archive ) source_path = input_dir / scenario.file_path source_path.parent.mkdir(parents=True, exist_ok=True) source_path.write_bytes(b"fake-docx") def _stub_extract( file_path, *, fixlevel=None, drawing_context=None, parse_warnings=None, parse_metadata=None, **_kwargs, ): if drawing_context is not None and scenario.assets: drawing_context.export_dir_path.mkdir(parents=True, exist_ok=True) for name, data in scenario.assets.items(): (drawing_context.export_dir_path / name).write_bytes(data) if parse_metadata is not None: parse_metadata.update(scenario.parse_metadata) return [dict(b) for b in scenario.blocks] rag = build_debug_rag() with ( mock.patch( "lightrag.parser.docx.parse_document.extract_docx_blocks", _stub_extract, ), mock.patch("lightrag.sidecar.writer.datetime", FrozenDateTime), ): async def _go() -> None: await rag.parse_native( scenario.doc_id, str(source_path), { "parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": "", }, ) asyncio.run(_go()) return input_dir / PARSED_DIR_NAME / f"{scenario.file_path}.parsed" def _read_bytes(path: Path) -> bytes: with path.open("rb") as fh: return fh.read() @pytest.mark.offline @pytest.mark.parametrize( "scenario", SCENARIOS, ids=[s.name for s in SCENARIOS], ) def test_native_docx_migration_is_byte_equivalent( scenario: Scenario, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: """Every file produced by parse_native must be byte-identical to the captured legacy baseline.""" input_dir = tmp_path / "inputs" input_dir.mkdir() parsed_dir = _run_new_path(scenario, input_dir, monkeypatch) expected_root = GOLDEN_ROOT / scenario.name assert expected_root.is_dir(), ( f"missing golden fixture for scenario {scenario.name!r}; " f"run scripts/regen_native_docx_golden.py to regenerate" ) # Collect both file sets relative to their roots, then compare. expected_files = { p.relative_to(expected_root): p for p in expected_root.rglob("*") if p.is_file() } produced_files = { p.relative_to(parsed_dir): p for p in parsed_dir.rglob("*") if p.is_file() } extra = sorted(produced_files.keys() - expected_files.keys()) missing = sorted(expected_files.keys() - produced_files.keys()) assert not extra, f"unexpected produced files: {extra}" assert not missing, f"missing produced files (legacy had them): {missing}" mismatches: list[str] = [] for rel, expected_path in expected_files.items(): produced_path = produced_files[rel] if _read_bytes(produced_path) != _read_bytes(expected_path): mismatches.append(str(rel)) assert ( not mismatches ), f"byte mismatch in scenario {scenario.name!r} for files: {mismatches}"