| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
- #!/usr/bin/env python
- """Regenerate the byte-equivalence golden fixtures for the native docx
- sidecar pipeline.

The fixtures live at
``tests/parser/docx/golden/native_docx/<scenario>/``
and capture the exact on-disk artifacts ``LightRAG.parse_native`` produces
for each scenario in ``tests/parser/docx/_native_docx_fixtures.py``.

Usage::

    python scripts/regen_native_docx_golden.py
- """
- from __future__ import annotations
- import asyncio
- import shutil
- import sys
- import tempfile
- from pathlib import Path
- from unittest import mock
# Repository root: two directory levels above this script file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Make both the project root and the docx test helpers importable. The
# fixtures directory ends up first on the path, the project root second.
sys.path[:0] = [
    str(PROJECT_ROOT / "tests" / "parser" / "docx"),
    str(PROJECT_ROOT),
]
async def _regen() -> None:
    """Rebuild every golden fixture directory from ``SCENARIOS``.

    For each scenario, ``parse_native`` is run against a throwaway input
    directory while the docx block extractor, the post-sync archive hook and
    the sidecar writer's ``datetime`` are patched. The resulting
    ``<file>.parsed`` tree is then copied into
    ``tests/parser/docx/golden/native_docx/<scenario.name>/``.

    The existing fixture directory is replaced only after the pipeline has
    produced its output directory, so a failing run does not leave an empty
    golden directory behind.

    Raises:
        RuntimeError: if ``parse_native`` returns without creating the
            expected ``.parsed`` output directory.
    """
    # Project imports are resolved at call time, i.e. after the module-level
    # sys.path setup has run.
    from lightrag.constants import (
        FULL_DOCS_FORMAT_PENDING_PARSE,
        PARSED_DIR_NAME,
    )
    from lightrag.parser.debug import (
        FrozenDateTime,
        build_debug_rag,
    )
    import lightrag.pipeline as pipeline_module
    from _native_docx_fixtures import SCENARIOS  # type: ignore[import]

    fixtures_root = (
        PROJECT_ROOT / "tests" / "parser" / "docx" / "golden" / "native_docx"
    )
    fixtures_root.mkdir(parents=True, exist_ok=True)

    async def _noop_archive(_p: str) -> None:
        # Replacement for the archive hook: leave the fake source untouched.
        return None

    for scenario in SCENARIOS:
        scenario_dir = fixtures_root / scenario.name

        # NOTE: the stub closes over the loop variable ``scenario``; it is
        # only ever invoked inside this same iteration, so late binding is
        # not an issue here.
        def _stub_extract(
            file_path,
            *,
            fixlevel=None,
            drawing_context=None,
            parse_warnings=None,
            parse_metadata=None,
            **_kwargs,
        ):
            """Return the scenario's canned blocks instead of parsing a docx."""
            if drawing_context is not None and scenario.assets:
                drawing_context.export_dir_path.mkdir(parents=True, exist_ok=True)
                for name, data in scenario.assets.items():
                    (drawing_context.export_dir_path / name).write_bytes(data)
            if parse_metadata is not None:
                parse_metadata.update(scenario.parse_metadata)
            # Hand back copies so the pipeline cannot mutate the scenario.
            return [dict(b) for b in scenario.blocks]

        with tempfile.TemporaryDirectory(prefix="regen_") as tmp_root:
            input_dir = Path(tmp_root) / "inputs"
            input_dir.mkdir()
            source_path = input_dir / scenario.file_path
            source_path.parent.mkdir(parents=True, exist_ok=True)
            # The content is irrelevant: extraction is stubbed out below.
            source_path.write_bytes(b"fake-docx")

            rag = build_debug_rag()
            with (
                mock.patch.dict("os.environ", {"INPUT_DIR": str(input_dir)}),
                mock.patch(
                    "lightrag.parser.docx.parse_document.extract_docx_blocks",
                    _stub_extract,
                ),
                mock.patch.object(
                    pipeline_module,
                    "archive_docx_source_after_full_docs_sync",
                    _noop_archive,
                ),
                mock.patch("lightrag.sidecar.writer.datetime", FrozenDateTime),
            ):
                await rag.parse_native(
                    scenario.doc_id,
                    str(source_path),
                    {
                        "parse_format": FULL_DOCS_FORMAT_PENDING_PARSE,
                        "content": "",
                        "source_path": str(source_path),
                    },
                )

            produced_dir = input_dir / PARSED_DIR_NAME / f"{scenario.file_path}.parsed"
            # Fail loudly instead of silently replacing a good fixture with
            # an empty directory when the pipeline wrote nothing here.
            if not produced_dir.is_dir():
                raise RuntimeError(
                    f"[regen] {scenario.name}: parse_native produced no "
                    f"output directory at {produced_dir}"
                )

            # Only now drop the previous fixture; everything up to this point
            # has succeeded, so the old golden files are safe to replace.
            if scenario_dir.exists():
                shutil.rmtree(scenario_dir)
            scenario_dir.mkdir(parents=True, exist_ok=True)

            # Copy while the temporary directory still exists.
            for item in produced_dir.rglob("*"):
                rel = item.relative_to(produced_dir)
                target = scenario_dir / rel
                target.parent.mkdir(parents=True, exist_ok=True)
                if item.is_dir():
                    target.mkdir(exist_ok=True)
                else:
                    shutil.copyfile(item, target)

        print(f"[regen] {scenario.name}: wrote {scenario_dir}")
def main() -> None:
    """Script entry point: drive the async regeneration to completion."""
    regen_coro = _regen()
    asyncio.run(regen_coro)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|