wxcz_admin
/
lightrag-cn-git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
							"""Scenario fixtures for the native docx → SidecarWriter migration tests.

Each scenario describes:

- ``blocks``      — what ``extract_docx_blocks`` would return (the synthetic
  block dicts that the adapter consumes).
- ``parse_metadata`` — the dict the upstream parser fills in (only
  ``first_heading`` is currently consumed by the adapter).
- ``assets``      — files the upstream extractor would have written into
  ``<base>.blocks.assets/`` before the IR builder runs. Maps relative names
  inside the asset dir → byte content.
- ``doc_id``      — fixed so blockid + sidecar ids are deterministic.
- ``file_path``   — used for canonical basename / doc_title fallback.

The captured outputs (``blocks.jsonl`` + per-modality JSONs + assets) live
under ``tests/parser/docx/golden/native_docx/<scenario>/``. The
production path (``LightRAG.parse_native``) must produce byte-identical
bytes vs those fixtures; the regen script under ``scripts/`` rewrites
them when the format intentionally changes.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


def _block(
    content: str,
    *,
    heading: str = "",
    level: int = 0,
    parent: list[str] | None = None,
    uuid: str = "p1",
    uuid_end: str | None = None,
    table_headers: list[Any] | None = None,
    table_chunk_role: str = "none",
) -> dict[str, Any]:
    """Build a synthetic block matching ``extract_docx_blocks`` output."""
    out: dict[str, Any] = {
        "uuid": uuid,
        "uuid_end": uuid_end if uuid_end is not None else uuid,
        "heading": heading,
        "content": content,
        "type": "text",
        "parent_headings": list(parent or []),
        "level": level,
        "table_chunk_role": table_chunk_role,
    }
    if table_headers is not None:
        out["table_headers"] = table_headers
    return out


@dataclass
class Scenario:
    name: str
    doc_id: str
    file_path: str  # canonical-ish; what the pipeline would pass
    blocks: list[dict[str, Any]]
    parse_metadata: dict[str, Any] = field(default_factory=dict)
    assets: dict[str, bytes] = field(default_factory=dict)


SCENARIOS: list[Scenario] = [
    # --- 1: text-only, multi-heading -----------------------------------
    Scenario(
        name="text_only_hierarchy",
        doc_id="doc-aaaa111122223333aaaa111122223333",
        file_path="paper.docx",
        parse_metadata={"first_heading": "Introduction"},
        blocks=[
            _block(
                "Introduction",
                heading="Introduction",
                level=1,
                uuid="h1",
            ),
            _block(
                "Body paragraph one.",
                heading="Introduction",
                level=1,
                uuid="p1",
                uuid_end="p2",
            ),
            _block(
                "Background",
                heading="Background",
                level=2,
                parent=["Introduction"],
                uuid="h2",
            ),
            _block(
                "Sub body.",
                heading="Background",
                level=2,
                parent=["Introduction"],
                uuid="p3",
            ),
        ],
    ),
    # --- 2: block + inline equations -----------------------------------
    Scenario(
        name="equations_block_and_inline",
        doc_id="doc-bbbb222233334444bbbb222233334444",
        file_path="formulas.docx",
        parse_metadata={"first_heading": "Equations"},
        blocks=[
            _block(
                "Equations",
                heading="Equations",
                level=1,
                uuid="h1",
            ),
            _block(
                # Inline equation (no surrounding \n on either side)
                "Energy is <equation>E=mc^2</equation> per Einstein.",
                heading="Equations",
                level=1,
                uuid="p1",
            ),
            _block(
                # Block equation (wedged between newlines)
                "Consider:\n<equation>x^2 + y^2 = r^2</equation>\nThe circle equation.",
                heading="Equations",
                level=1,
                uuid="p2",
            ),
            _block(
                # Block at content edge (start == 0)
                "<equation>a + b = c</equation>\ntext after",
                heading="Equations",
                level=1,
                uuid="p3",
            ),
        ],
    ),
    # --- 3: tables with and without table_headers ----------------------
    Scenario(
        name="tables_mixed",
        doc_id="doc-cccc333344445555cccc333344445555",
        file_path="report.docx",
        parse_metadata={"first_heading": "Report"},
        blocks=[
            _block(
                "Report",
                heading="Report",
                level=1,
                uuid="h1",
            ),
            _block(
                # Table with table_headers (cross-page repeating)
                'See table:\n<table>[["X","Y"],["1","2"],["3","4"]]</table>',
                heading="Report",
                level=1,
                uuid="t1",
                table_headers=[[["X", "Y"]]],  # one table, one header row
            ),
            _block(
                # Table without table_headers
                'Plain table:\n<table>[["a","b"]]</table>',
                heading="Report",
                level=1,
                uuid="t2",
            ),
            _block(
                # Two tables in one block
                '<table>[["p"]]</table>\nthen\n<table>[["q","r"],["s","t"]]</table>',
                heading="Report",
                level=1,
                uuid="t3",
                table_headers=[None, [["q", "r"]]],
            ),
        ],
    ),
    # --- 4: drawings + assets ------------------------------------------
    Scenario(
        name="drawings_with_assets",
        doc_id="doc-dddd444455556666dddd444455556666",
        file_path="diagrams.docx",
        parse_metadata={"first_heading": "Diagrams"},
        assets={
            "fig1.png": b"\x89PNG\r\n\x1a\n-fig1-fake",
            "fig2.jpg": b"\xff\xd8\xff\xe0-fig2-fake",
        },
        blocks=[
            _block(
                "Diagrams",
                heading="Diagrams",
                level=1,
                uuid="h1",
            ),
            _block(
                "Figure one:\n"
                '<drawing id="x" format="png" '
                'path="diagrams.blocks.assets/fig1.png" '
                'src="docx://image1" />\n'
                "Figure two:\n"
                '<drawing id="y" format="jpg" '
                'path="diagrams.blocks.assets/fig2.jpg" '
                'src="docx://image2" />',
                heading="Diagrams",
                level=1,
                uuid="p1",
            ),
        ],
    ),
    # --- 5: all modalities mixed ---------------------------------------
    Scenario(
        name="all_modalities",
        doc_id="doc-eeee555566667777eeee555566667777",
        file_path="combo.docx",
        parse_metadata={"first_heading": "Combined"},
        assets={"pic.png": b"PNG-combo"},
        blocks=[
            _block(
                "Combined",
                heading="Combined",
                level=1,
                uuid="h1",
            ),
            _block(
                "Look at this figure:\n"
                '<drawing id="z" format="png" '
                'path="combo.blocks.assets/pic.png" '
                'src="docx://img" />\n'
                "Plus a table:\n"
                '<table>[["α","β"],["γ","δ"]]</table>\n'
                "And a block equation:\n"
                "<equation>F = ma</equation>\n"
                "And an inline <equation>v=d/t</equation> here.",
                heading="Combined",
                level=1,
                uuid="p1",
            ),
        ],
    ),
    # --- 6: empty block dropped ----------------------------------------
    Scenario(
        name="empty_block_dropped",
        doc_id="doc-ffff666677778888ffff666677778888",
        file_path="sparse.docx",
        parse_metadata={"first_heading": "Sparse"},
        blocks=[
            _block(
                "Sparse",
                heading="Sparse",
                level=1,
                uuid="h1",
            ),
            _block(
                "   \n   ",  # strips to empty — must be dropped
                heading="Sparse",
                level=1,
                uuid="p_empty",
            ),
            _block(
                "Real content after empty.",
                heading="Sparse",
                level=1,
                uuid="p_real",
            ),
        ],
    ),
    # --- 7: external / linked image references ------------------------
    # DOCX can carry ``<a:blip r:link="rId…"/>`` references to image
    # targets that live outside the package — the upstream extractor
    # then emits ``<drawing path="<external URL or unresolved path>" />``
    # WITHOUT writing bytes into ``<base>.blocks.assets/``. The adapter
    # must pass those paths through verbatim (both in ``blocks.jsonl``
    # and ``drawings.json``); turning them into AssetSpecs with
    # ``source=None`` would make the writer warn-and-skip → ``path=""``,
    # losing the only reference downstream consumers have.
    Scenario(
        name="external_image_link",
        doc_id="doc-1111aaaa2222bbbb1111aaaa2222bbbb",
        file_path="linked.docx",
        parse_metadata={"first_heading": "Linked"},
        # No on-disk assets — the path points elsewhere.
        assets={},
        blocks=[
            _block(
                "Linked",
                heading="Linked",
                level=1,
                uuid="h1",
            ),
            _block(
                "See the diagram online:\n"
                '<drawing id="z" format="png" '
                'path="https://example.com/diagrams/architecture.png" '
                'src="docx://external" />\n'
                "And a relative-but-not-asset path:\n"
                '<drawing id="z2" format="gif" '
                'path="../images/legacy.gif" '
                'src="docx://legacy" />',
                heading="Linked",
                level=1,
                uuid="p1",
            ),
        ],
    ),
    # --- 8: missing paraid ---------------------------------------------
    Scenario(
        name="missing_paraid",
        doc_id="doc-99990000111122229999000011112222",
        file_path="legacy.docx",
        parse_metadata={"first_heading": ""},  # no headings at all
        blocks=[
            _block(
                "Just plain text without a heading.",
                heading="",
                level=0,
                uuid="",  # missing
                uuid_end="",
            ),
            _block(
                "Another paragraph with no paraId.",
                heading="",
                level=0,
                uuid="",
                uuid_end="",
            ),
        ],
    ),
]


__all__ = ["Scenario", "SCENARIOS", "_block"]