test_native_docx_golden.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. """Byte-equivalence golden tests for the native docx → SidecarWriter migration.
  2. These tests run the production code path (``LightRAG.parse_native``) on
  3. each scenario in ``_native_docx_fixtures.SCENARIOS`` and assert that
  4. every produced file matches the captured baseline bytes under
  5. ``tests/parser/docx/golden/native_docx/<scenario>/``.
  6. The baseline was generated by ``scripts/regen_native_docx_golden.py``;
  7. the script lives in-tree so the snapshots can be regenerated when
  8. intentional changes to the format are made (e.g. spec evolution).
  9. """
  10. from __future__ import annotations
  11. import asyncio
  12. import sys
  13. from pathlib import Path
  14. from unittest import mock
  15. import pytest
# Put this test's own directory on ``sys.path`` so the sibling helper module
# ``_native_docx_fixtures`` can be imported by its bare name, as if it were a
# top-level module. Avoids needing a package __init__ rename.
HERE = Path(__file__).resolve().parent
# Prepend only when the entry is absent so it is not duplicated.
if str(HERE) not in sys.path:
    sys.path.insert(0, str(HERE))
  21. from _native_docx_fixtures import SCENARIOS, Scenario # noqa: E402
  22. from lightrag.parser.debug import ( # noqa: E402
  23. FrozenDateTime,
  24. build_debug_rag,
  25. )
  26. GOLDEN_ROOT = HERE / "golden" / "native_docx"
  27. def _run_new_path(
  28. scenario: Scenario, input_dir: Path, monkeypatch: pytest.MonkeyPatch
  29. ) -> Path:
  30. """Invoke ``LightRAG.parse_native`` on a single scenario.
  31. The upstream ``extract_docx_blocks`` is stubbed to feed the synthetic
  32. blocks and produce the matching asset files inside
  33. ``<base>.blocks.assets/``. ``datetime.now`` is frozen so ``parse_time``
  34. matches the captured baseline exactly.
  35. Returns the parsed directory containing the produced artifacts.
  36. """
  37. from lightrag.constants import (
  38. FULL_DOCS_FORMAT_PENDING_PARSE,
  39. PARSED_DIR_NAME,
  40. )
  41. monkeypatch.setenv("INPUT_DIR", str(input_dir))
  42. # parse_native archives the source after parsing; for the golden test
  43. # we don't want the docx moving around between scenarios.
  44. async def _noop_archive(_p: str) -> None:
  45. return None
  46. import lightrag.pipeline as pipeline_module
  47. monkeypatch.setattr(
  48. pipeline_module, "archive_docx_source_after_full_docs_sync", _noop_archive
  49. )
  50. source_path = input_dir / scenario.file_path
  51. source_path.parent.mkdir(parents=True, exist_ok=True)
  52. source_path.write_bytes(b"fake-docx")
  53. def _stub_extract(
  54. file_path,
  55. *,
  56. fixlevel=None,
  57. drawing_context=None,
  58. parse_warnings=None,
  59. parse_metadata=None,
  60. **_kwargs,
  61. ):
  62. if drawing_context is not None and scenario.assets:
  63. drawing_context.export_dir_path.mkdir(parents=True, exist_ok=True)
  64. for name, data in scenario.assets.items():
  65. (drawing_context.export_dir_path / name).write_bytes(data)
  66. if parse_metadata is not None:
  67. parse_metadata.update(scenario.parse_metadata)
  68. return [dict(b) for b in scenario.blocks]
  69. rag = build_debug_rag()
  70. with (
  71. mock.patch(
  72. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  73. _stub_extract,
  74. ),
  75. mock.patch("lightrag.sidecar.writer.datetime", FrozenDateTime),
  76. ):
  77. async def _go() -> None:
  78. await rag.parse_native(
  79. scenario.doc_id,
  80. str(source_path),
  81. {
  82. "parse_format": FULL_DOCS_FORMAT_PENDING_PARSE,
  83. "content": "",
  84. },
  85. )
  86. asyncio.run(_go())
  87. return input_dir / PARSED_DIR_NAME / f"{scenario.file_path}.parsed"
  88. def _read_bytes(path: Path) -> bytes:
  89. with path.open("rb") as fh:
  90. return fh.read()
  91. @pytest.mark.offline
  92. @pytest.mark.parametrize(
  93. "scenario",
  94. SCENARIOS,
  95. ids=[s.name for s in SCENARIOS],
  96. )
  97. def test_native_docx_migration_is_byte_equivalent(
  98. scenario: Scenario,
  99. tmp_path: Path,
  100. monkeypatch: pytest.MonkeyPatch,
  101. ) -> None:
  102. """Every file produced by parse_native must be byte-identical to the
  103. captured legacy baseline."""
  104. input_dir = tmp_path / "inputs"
  105. input_dir.mkdir()
  106. parsed_dir = _run_new_path(scenario, input_dir, monkeypatch)
  107. expected_root = GOLDEN_ROOT / scenario.name
  108. assert expected_root.is_dir(), (
  109. f"missing golden fixture for scenario {scenario.name!r}; "
  110. f"run scripts/regen_native_docx_golden.py to regenerate"
  111. )
  112. # Collect both file sets relative to their roots, then compare.
  113. expected_files = {
  114. p.relative_to(expected_root): p for p in expected_root.rglob("*") if p.is_file()
  115. }
  116. produced_files = {
  117. p.relative_to(parsed_dir): p for p in parsed_dir.rglob("*") if p.is_file()
  118. }
  119. extra = sorted(produced_files.keys() - expected_files.keys())
  120. missing = sorted(expected_files.keys() - produced_files.keys())
  121. assert not extra, f"unexpected produced files: {extra}"
  122. assert not missing, f"missing produced files (legacy had them): {missing}"
  123. mismatches: list[str] = []
  124. for rel, expected_path in expected_files.items():
  125. produced_path = produced_files[rel]
  126. if _read_bytes(produced_path) != _read_bytes(expected_path):
  127. mismatches.append(str(rel))
  128. assert (
  129. not mismatches
  130. ), f"byte mismatch in scenario {scenario.name!r} for files: {mismatches}"