regen_native_docx_golden.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. #!/usr/bin/env python
  2. """Regenerate the byte-equivalence golden fixtures for the native docx
  3. sidecar pipeline.

The fixtures live at
``tests/parser/docx/golden/native_docx/<scenario>/``
and capture the exact on-disk artifacts ``LightRAG.parse_native`` produces
for each scenario in ``tests/parser/docx/_native_docx_fixtures.py``.

Usage::

    python scripts/regen_native_docx_golden.py
  10. """
  11. from __future__ import annotations
  12. import asyncio
  13. import shutil
  14. import sys
  15. import tempfile
  16. from pathlib import Path
  17. from unittest import mock
  18. PROJECT_ROOT = Path(__file__).resolve().parent.parent
  19. sys.path.insert(0, str(PROJECT_ROOT))
  20. sys.path.insert(0, str(PROJECT_ROOT / "tests" / "parser" / "docx"))
  21. async def _regen() -> None:
  22. from lightrag.constants import (
  23. FULL_DOCS_FORMAT_PENDING_PARSE,
  24. PARSED_DIR_NAME,
  25. )
  26. from lightrag.parser.debug import (
  27. FrozenDateTime,
  28. build_debug_rag,
  29. )
  30. import lightrag.pipeline as pipeline_module
  31. from _native_docx_fixtures import SCENARIOS # type: ignore[import]
  32. fixtures_root = (
  33. PROJECT_ROOT / "tests" / "parser" / "docx" / "golden" / "native_docx"
  34. )
  35. fixtures_root.mkdir(parents=True, exist_ok=True)
  36. async def _noop_archive(_p: str) -> None:
  37. return None
  38. for scenario in SCENARIOS:
  39. scenario_dir = fixtures_root / scenario.name
  40. if scenario_dir.exists():
  41. shutil.rmtree(scenario_dir)
  42. scenario_dir.mkdir(parents=True, exist_ok=True)
  43. def _stub_extract(
  44. file_path,
  45. *,
  46. fixlevel=None,
  47. drawing_context=None,
  48. parse_warnings=None,
  49. parse_metadata=None,
  50. **_kwargs,
  51. ):
  52. if drawing_context is not None and scenario.assets:
  53. drawing_context.export_dir_path.mkdir(parents=True, exist_ok=True)
  54. for name, data in scenario.assets.items():
  55. (drawing_context.export_dir_path / name).write_bytes(data)
  56. if parse_metadata is not None:
  57. parse_metadata.update(scenario.parse_metadata)
  58. return [dict(b) for b in scenario.blocks]
  59. with tempfile.TemporaryDirectory(prefix="regen_") as tmp_root:
  60. input_dir = Path(tmp_root) / "inputs"
  61. input_dir.mkdir()
  62. source_path = input_dir / scenario.file_path
  63. source_path.parent.mkdir(parents=True, exist_ok=True)
  64. source_path.write_bytes(b"fake-docx")
  65. rag = build_debug_rag()
  66. with (
  67. mock.patch.dict("os.environ", {"INPUT_DIR": str(input_dir)}),
  68. mock.patch(
  69. "lightrag.parser.docx.parse_document.extract_docx_blocks",
  70. _stub_extract,
  71. ),
  72. mock.patch.object(
  73. pipeline_module,
  74. "archive_docx_source_after_full_docs_sync",
  75. _noop_archive,
  76. ),
  77. mock.patch("lightrag.sidecar.writer.datetime", FrozenDateTime),
  78. ):
  79. await rag.parse_native(
  80. scenario.doc_id,
  81. str(source_path),
  82. {
  83. "parse_format": FULL_DOCS_FORMAT_PENDING_PARSE,
  84. "content": "",
  85. "source_path": str(source_path),
  86. },
  87. )
  88. produced_dir = input_dir / PARSED_DIR_NAME / f"{scenario.file_path}.parsed"
  89. for item in produced_dir.rglob("*"):
  90. rel = item.relative_to(produced_dir)
  91. target = scenario_dir / rel
  92. target.parent.mkdir(parents=True, exist_ok=True)
  93. if item.is_dir():
  94. target.mkdir(exist_ok=True)
  95. else:
  96. shutil.copyfile(item, target)
  97. print(f"[regen] {scenario.name}: wrote {scenario_dir}")
  98. def main() -> None:
  99. asyncio.run(_regen())
  100. if __name__ == "__main__":
  101. main()