# wxcz_admin / lightrag-cn-git-d5efd3
"""Byte-equivalence golden tests for the native docx → SidecarWriter migration.

These tests run the production code path (``LightRAG.parse_native``) on
each scenario in ``_native_docx_fixtures.SCENARIOS`` and assert that
every produced file matches the captured baseline bytes under
``tests/parser/docx/golden/native_docx/<scenario>/``.

The baseline was generated by ``scripts/regen_native_docx_golden.py``;
the script lives in-tree so the snapshots can be regenerated when
intentional changes to the format are made (e.g. spec evolution).
"""

from __future__ import annotations

import asyncio
import sys
from pathlib import Path
from unittest import mock

import pytest

# Make the sibling ``_native_docx_fixtures`` module importable as if it were a
# top-level helper: this file's directory is put at the front of ``sys.path``
# (only when it is not already listed) before the import below runs. Avoids
# needing a package ``__init__`` rename.
HERE = Path(__file__).resolve().parent
if str(HERE) not in sys.path:
    sys.path.insert(0, str(HERE))

from _native_docx_fixtures import SCENARIOS, Scenario  # noqa: E402
from lightrag.parser.debug import (  # noqa: E402
    FrozenDateTime,
    build_debug_rag,
)

# Root of the captured baseline tree; the test resolves one sub-directory per
# scenario as ``GOLDEN_ROOT / scenario.name``.
GOLDEN_ROOT = HERE / "golden" / "native_docx"


def _run_new_path(
    scenario: Scenario, input_dir: Path, monkeypatch: pytest.MonkeyPatch
) -> Path:
    """Drive ``LightRAG.parse_native`` for one scenario and locate its output.

    A placeholder source file is written under ``input_dir``; the upstream
    ``extract_docx_blocks`` is replaced by a stub that replays the scenario's
    synthetic blocks and writes the matching asset files inside
    ``<base>.blocks.assets/``. ``datetime.now`` is frozen so ``parse_time``
    matches the captured baseline exactly.

    Args:
        scenario: The fixture scenario supplying ``doc_id``, ``file_path``,
            ``blocks``, ``assets`` and ``parse_metadata``.
        input_dir: Directory exported as ``INPUT_DIR`` and used as the root
            for both the fake source file and the returned parsed directory.
        monkeypatch: pytest fixture used for the env var and the archive stub.

    Returns:
        The parsed directory containing the produced artifacts.
    """
    from lightrag.constants import (
        FULL_DOCS_FORMAT_PENDING_PARSE,
        PARSED_DIR_NAME,
    )

    monkeypatch.setenv("INPUT_DIR", str(input_dir))

    import lightrag.pipeline as pipeline_module

    # parse_native archives the source once parsing is done; the golden test
    # wants the docx to stay put between scenarios, so archiving is a no-op.
    async def _skip_archive(_p: str) -> None:
        return None

    monkeypatch.setattr(
        pipeline_module, "archive_docx_source_after_full_docs_sync", _skip_archive
    )

    docx_path = input_dir / scenario.file_path
    docx_path.parent.mkdir(parents=True, exist_ok=True)
    docx_path.write_bytes(b"fake-docx")

    def _fake_extract(
        file_path,
        *,
        fixlevel=None,
        drawing_context=None,
        parse_warnings=None,
        parse_metadata=None,
        **_ignored,
    ):
        # Materialise the scenario's assets only when the caller handed us a
        # drawing context and the scenario actually defines some.
        if drawing_context is not None:
            assets = scenario.assets
            if assets:
                export_dir = drawing_context.export_dir_path
                export_dir.mkdir(parents=True, exist_ok=True)
                for asset_name, payload in assets.items():
                    (export_dir / asset_name).write_bytes(payload)
        if parse_metadata is not None:
            parse_metadata.update(scenario.parse_metadata)
        # Hand back fresh dict copies so the fixture blocks are not shared.
        return list(map(dict, scenario.blocks))

    rag = build_debug_rag()

    pending_doc = {
        "parse_format": FULL_DOCS_FORMAT_PENDING_PARSE,
        "content": "",
    }

    async def _drive() -> None:
        await rag.parse_native(scenario.doc_id, str(docx_path), pending_doc)

    extract_patch = mock.patch(
        "lightrag.parser.docx.parse_document.extract_docx_blocks",
        _fake_extract,
    )
    clock_patch = mock.patch("lightrag.sidecar.writer.datetime", FrozenDateTime)
    with extract_patch, clock_patch:
        asyncio.run(_drive())

    parsed_name = f"{scenario.file_path}.parsed"
    return input_dir / PARSED_DIR_NAME / parsed_name


def _read_bytes(path: Path) -> bytes:
    """Return the entire content of the file at *path* as raw bytes."""
    return path.read_bytes()


@pytest.mark.offline
@pytest.mark.parametrize(
    "scenario",
    SCENARIOS,
    ids=[item.name for item in SCENARIOS],
)
def test_native_docx_migration_is_byte_equivalent(
    scenario: Scenario,
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Check ``parse_native`` output against the golden files, byte for byte.

    Every file produced by parse_native must be byte-identical to the
    captured legacy baseline: the set of relative paths must match exactly
    (no extra and no missing files) and each file's content must be equal.
    """

    def _index(root: Path) -> dict[Path, Path]:
        # Map every regular file below ``root`` by its root-relative path.
        return {f.relative_to(root): f for f in root.rglob("*") if f.is_file()}

    input_dir = tmp_path / "inputs"
    input_dir.mkdir()
    parsed_dir = _run_new_path(scenario, input_dir, monkeypatch)

    expected_root = GOLDEN_ROOT / scenario.name
    assert expected_root.is_dir(), (
        f"missing golden fixture for scenario {scenario.name!r}; "
        f"run scripts/regen_native_docx_golden.py to regenerate"
    )

    expected_files = _index(expected_root)
    produced_files = _index(parsed_dir)

    # First make sure both trees contain exactly the same relative paths.
    expected_names = set(expected_files)
    produced_names = set(produced_files)
    extra = sorted(produced_names - expected_names)
    missing = sorted(expected_names - produced_names)
    assert not extra, f"unexpected produced files: {extra}"
    assert not missing, f"missing produced files (legacy had them): {missing}"

    # Then compare contents, collecting every differing file for the report.
    mismatches = [
        str(rel)
        for rel, expected_path in expected_files.items()
        if _read_bytes(produced_files[rel]) != _read_bytes(expected_path)
    ]
    assert (
        not mismatches
    ), f"byte mismatch in scenario {scenario.name!r} for files: {mismatches}"