# wxcz_admin / lightrag-cn-git-d5efd3

"""End-to-end test: native docx → LightRAG Document → stable cache key.

The original bug this guards against: ``parse_native`` used to write a
runtime-stamped structured parser payload into ``full_docs.content``, so
re-parsing the same docx produced different
chunk-0 content and therefore different LLM cache keys.

After the fix, ``parse_native`` writes ``.blocks.jsonl`` + sidecars and
``full_docs`` is in LIGHTRAG format. ``_load_lightrag_document_content``
skips the ``meta`` line (which contains ``parse_time``) and concatenates
only ``"type": "content"`` rows, so re-parsing must yield byte-identical
``merged_text`` and stable downstream chunk-0 content.
"""

import asyncio
import json
from pathlib import Path

import pytest

from lightrag import LightRAG
from lightrag.constants import (
    FULL_DOCS_FORMAT_PENDING_PARSE,
    PARSED_DIR_NAME,
)
from lightrag.utils import Tokenizer, TokenizerInterface, compute_args_hash


def _block(content, *, heading="", level=0, parent=None, uuid="p1"):
    """Return one synthetic block dict shaped like ``extract_docx_blocks`` output.

    ``uuid`` fills both the ``uuid`` and ``uuid_end`` keys. ``parent`` may be
    any iterable (or ``None``); it is copied into a fresh list so the returned
    ``parent_headings`` value is never shared with the caller.
    """
    parent_headings = [*parent] if parent else []
    synthetic = dict(
        uuid=uuid,
        uuid_end=uuid,
        heading=heading,
        content=content,
        type="text",
        parent_headings=parent_headings,
        level=level,
        table_chunk_role="none",
    )
    return synthetic


class _MiniFullDocs:
    """Dict-backed, in-memory stand-in for the ``full_docs`` storage."""

    def __init__(self):
        # Rows keyed by document id; exposed so callers can inspect or reset it.
        self.data = dict()

    async def upsert(self, payload):
        """Merge ``payload`` into the stored rows, overwriting existing ids."""
        self.data.update(payload)

    async def get_by_id(self, doc_id):
        """Return the row stored under ``doc_id``, or ``None`` when absent."""
        return self.data.get(doc_id, None)

    async def index_done_callback(self):
        """Do nothing; an in-memory store has nothing to flush."""
        return


class _MiniDocStatus:
    """Inert ``doc_status`` stand-in: lookups find nothing, writes are dropped."""

    async def get_by_id(self, doc_id):
        """Report every ``doc_id`` as unknown by returning ``None``."""
        return

    async def upsert(self, data):
        """Accept ``data`` and discard it."""
        return


class _CharTokenizer(TokenizerInterface):
    """Tokenizer that treats every character as one token (its code point)."""

    def encode(self, content: str):
        """Return the Unicode code point of each character in ``content``."""
        return list(map(ord, content))

    def decode(self, tokens):
        """Rebuild the string whose code points are ``tokens``."""
        return "".join(map(chr, tokens))


class _MiniRag:
    """Minimal object passed as ``self`` to ``LightRAG.parse_native``.

    It carries just enough surface for parse_native and the parser/docx
    adapter: a working dir, stub storages, a tokenizer and a path resolver.
    """

    # Reuse the real persistence routine so it runs against this stub.
    _persist_parsed_full_docs = LightRAG._persist_parsed_full_docs

    def __init__(self, working_dir):
        self.working_dir = str(working_dir)
        self.full_docs = _MiniFullDocs()
        self.doc_status = _MiniDocStatus()
        # One token per character keeps token counts trivially predictable.
        char_level = _CharTokenizer()
        self.tokenizer = Tokenizer(model_name="char", tokenizer=char_level)

    def _resolve_source_file_for_parser(self, file_path):
        """Identity resolver: hand back ``file_path`` unchanged."""
        return file_path


@pytest.mark.offline
def test_native_lightrag_path_produces_stable_merged_text(tmp_path, monkeypatch):
    """Parsing the same docx twice must return byte-identical merged_text.

    Identical merged_text means an identical chunk-0 prompt and therefore an
    identical ``compute_args_hash`` value for that chunk.
    """
    import shutil

    doc_id = "doc-stable"
    raw_docx = b"fake docx bytes"

    async def _parse(rag, docx_path):
        # Build a fresh pending row per call so the two parses share no state.
        return await LightRAG.parse_native(
            rag,
            doc_id,
            str(docx_path),
            {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
        )

    async def _scenario():
        input_dir = tmp_path / "input"
        input_dir.mkdir()
        monkeypatch.setenv("INPUT_DIR", str(input_dir))

        docx_path = input_dir / "stable.docx"
        docx_path.write_bytes(raw_docx)

        # The upstream DOCX extractor is replaced at the adapter boundary so
        # only the LightRAG-specific writing logic is exercised.
        canned_blocks = [
            _block(
                "Title\nFirst paragraph body.\nSecond paragraph body.",
                heading="Title",
                level=1,
            ),
        ]

        def _fake_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
            # Return copies so neither parse can mutate the canned fixture.
            return [entry.copy() for entry in canned_blocks]

        monkeypatch.setattr(
            "lightrag.parser.docx.parse_document.extract_docx_blocks",
            _fake_extract,
        )

        rag = _MiniRag(tmp_path / "work")

        # ---- Parse #1 ----
        first_result = await _parse(rag, docx_path)
        first_text = first_result["content"]
        assert first_text, "first parse produced empty merged_text"

        # ---- Parse #2 ----
        # parse_native archives (moves) the source, so write it again; also
        # drop the in-memory row and any parsed artifacts so the writer has
        # to regenerate the meta line (fresh parse_time) and content lines.
        docx_path.write_bytes(raw_docx)
        rag.full_docs.data.clear()
        stale_dir = input_dir / PARSED_DIR_NAME / f"{docx_path.name}.parsed"
        if stale_dir.exists():
            shutil.rmtree(stale_dir)

        second_result = await _parse(rag, docx_path)
        second_text = second_result["content"]

        # Core invariant: the merged text does not change between runs even
        # though the meta line's parse_time does.
        assert first_text == second_text

        # A chunk-0 prompt built from the merged text, and its hash, must
        # match as well; that is what makes LLM cache hits possible.
        prompt_template = "EXTRACT_PROMPT::{text}"
        first_prompt = prompt_template.format(text=first_text[:200])
        second_prompt = prompt_template.format(text=second_text[:200])
        assert first_prompt == second_prompt
        assert compute_args_hash(first_prompt) == compute_args_hash(second_prompt)

        # full_docs.content carries the {{LRdoc}} marker followed by a leading
        # summary taken from merged_text rather than a placeholder.
        stored = rag.full_docs.data[doc_id]
        assert stored["parse_format"] == "lightrag"
        assert stored["content"].startswith("{{LRdoc}}")
        assert first_text[:40] in stored["content"]

    asyncio.run(_scenario())


@pytest.mark.offline
def test_native_lightrag_path_writes_blocks_jsonl_and_skips_meta_on_load(
    tmp_path, monkeypatch
):
    """Sanity check: ``_load_lightrag_document_content`` must skip the
    meta line (where the runtime ``parse_time`` lives) and only return
    body content. This is what lets re-parsing produce stable text."""

    async def _run():
        input_dir = tmp_path / "input"
        input_dir.mkdir()
        monkeypatch.setenv("INPUT_DIR", str(input_dir))

        source_path = input_dir / "skipmeta.docx"
        source_path.write_bytes(b"fake")

        def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
            # Single body-only block; no upstream DOCX parsing takes place.
            return [_block("the body")]

        monkeypatch.setattr(
            "lightrag.parser.docx.parse_document.extract_docx_blocks",
            _stub_extract,
        )

        rag = _MiniRag(tmp_path / "work")
        result = await LightRAG.parse_native(
            rag,
            "doc-skip",
            str(source_path),
            {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
        )

        # The .blocks.jsonl on disk DOES contain "parse_time" inside the
        # meta line; the merged_text returned by parse_native MUST NOT.
        # Path.read_text opens and closes the file itself, so no handle is
        # left dangling (a bare open(...).read() leaks it until GC).
        on_disk = Path(result["blocks_path"]).read_text(encoding="utf-8")
        assert "parse_time" in on_disk
        assert "parse_time" not in result["content"]
        assert result["content"].strip() == "the body"

    asyncio.run(_run())


@pytest.mark.offline
def test_native_lightrag_path_leaves_unknown_table_caption_empty(tmp_path, monkeypatch):
    """A table with no known title must keep an empty caption.

    The native DOCX parser does not infer table titles, so neither the block
    content nor the table sidecar may carry a synthesized caption such as
    ``表1``.
    """
    table_id = "tb-table-0001"

    def _fake_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
        # One block with text on both sides of a single inline table.
        return [_block('before\n<table>[["A"]]</table>\nafter')]

    async def _scenario():
        input_dir = tmp_path / "input"
        input_dir.mkdir()
        monkeypatch.setenv("INPUT_DIR", str(input_dir))

        docx_path = input_dir / "table.docx"
        docx_path.write_bytes(b"fake")

        monkeypatch.setattr(
            "lightrag.parser.docx.parse_document.extract_docx_blocks",
            _fake_extract,
        )

        rag = _MiniRag(tmp_path / "work")
        pending_row = {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""}
        result = await LightRAG.parse_native(
            rag,
            "doc-table",
            str(docx_path),
            pending_row,
        )

        blocks_path = Path(result["blocks_path"])
        jsonl_rows = blocks_path.read_text(encoding="utf-8").splitlines()
        # Index 1 is the first row after the leading line of the file
        # (presumably the meta line, so this is the first content block).
        first_block = json.loads(jsonl_rows[1])
        for forbidden in ("caption=", "表1"):
            assert forbidden not in first_block["content"]

        # Strip the final suffix, then swap the remaining one for .tables.json.
        stem_path = blocks_path.with_suffix("")
        tables_path = stem_path.with_suffix(".tables.json")

        def _table_entry():
            # Re-read from disk each time so later rewrites are observed.
            sidecar = json.loads(tables_path.read_text(encoding="utf-8"))
            return sidecar["tables"][table_id]

        assert _table_entry()["caption"] == ""

        # Surrounding context is backfilled at analyze_multimodal entry rather
        # than in parse_native, so call that routine directly here.
        from lightrag.multimodal_context import enrich_sidecars_with_surrounding

        enrich_sidecars_with_surrounding(
            blocks_path=str(blocks_path),
            enabled_modalities={"tables"},
            tokenizer=rag.tokenizer,
        )
        expected_surrounding = {
            "leading": "before\n",
            "trailing": "\nafter",
        }
        assert _table_entry()["surrounding"] == expected_surrounding

    asyncio.run(_scenario())


@pytest.mark.offline
def test_analyze_entrypoint_backfills_surrounding_for_all_sidecars(
    tmp_path, monkeypatch
):
    """Sidecars gain ``surrounding`` only once the analyze-time routine runs.

    Straight after ``parse_native`` no drawings/tables/equations entry has a
    ``surrounding`` field; after ``enrich_sidecars_with_surrounding`` every
    entry has one with exactly the ``leading`` and ``trailing`` keys.
    """
    modalities = ("drawings", "tables", "equations")

    def _fake_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
        assert drawing_context is not None
        export_dir = drawing_context.export_dir_path
        assert export_dir is not None
        # Put the fake image where the <drawing/> tag below points.
        (export_dir / "pic.png").write_bytes(b"PNG")
        mixed_content = (
            'alpha <drawing id="1" format="png" '
            'path="all_modalities.blocks.assets/pic.png" /> beta\n'
            '<table>[["A"]]</table> gamma\n'
            "<equation>E=mc^2</equation>\n"
            "delta"
        )
        return [_block(mixed_content)]

    async def _scenario():
        input_dir = tmp_path / "input"
        input_dir.mkdir()
        monkeypatch.setenv("INPUT_DIR", str(input_dir))

        docx_path = input_dir / "all_modalities.docx"
        docx_path.write_bytes(b"fake")

        monkeypatch.setattr(
            "lightrag.parser.docx.parse_document.extract_docx_blocks",
            _fake_extract,
        )

        rag = _MiniRag(tmp_path / "work")
        pending_row = {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""}
        result = await LightRAG.parse_native(
            rag,
            "doc-mm",
            str(docx_path),
            pending_row,
        )

        blocks_path = Path(result["blocks_path"])
        # Sidecars share the blocks file's path minus its ".blocks.jsonl" tail.
        base = str(blocks_path)[: -len(".blocks.jsonl")]

        def _sidecar_items(root):
            # Each sidecar is "<base>.<root>.json" with entries under <root>.
            sidecar = json.loads(Path(base + f".{root}.json").read_text("utf-8"))
            return sidecar[root]

        # Parse-time sidecars must not have the field yet.
        for root in modalities:
            for entry in _sidecar_items(root).values():
                assert "surrounding" not in entry

        # Run the routine analyze_multimodal uses and check every modality.
        from lightrag.multimodal_context import enrich_sidecars_with_surrounding

        enrich_sidecars_with_surrounding(
            blocks_path=str(blocks_path),
            enabled_modalities=set(modalities),
            tokenizer=rag.tokenizer,
        )
        for root in modalities:
            entries = _sidecar_items(root)
            assert entries
            for entry in entries.values():
                assert "surrounding" in entry
                assert set(entry["surrounding"]) == {"leading", "trailing"}

    asyncio.run(_scenario())


@pytest.mark.offline
def test_native_lightrag_path_writes_image_assets_to_blocks_assets_dir(
    tmp_path, monkeypatch
):
    """Native parsing must drop image bytes into ``<base>.blocks.assets/``
    after the adapter creates the parsed dir (which it wipes at the start),
    and a drawings sidecar must be written next to the ``.blocks.jsonl``.
    """

    async def _run():
        input_dir = tmp_path / "input"
        input_dir.mkdir()
        monkeypatch.setenv("INPUT_DIR", str(input_dir))

        source_path = input_dir / "with_pics.docx"
        source_path.write_bytes(b"fake")

        def _stub_extract(file_path, fixlevel=None, drawing_context=None, **kwargs):
            # The adapter already created the asset dir before calling us;
            # write the fake image bytes there as a side-effect, then return
            # a block whose content references that asset via <drawing .../>.
            assert drawing_context is not None
            assert drawing_context.export_dir_path is not None
            (drawing_context.export_dir_path / "pic.png").write_bytes(b"PNG-BYTES")
            return [
                _block(
                    "intro\n"
                    '<drawing id="1" name="pic" format="png" '
                    'path="with_pics.blocks.assets/pic.png" />\n'
                    "outro",
                    heading="intro",
                    level=1,
                ),
            ]

        monkeypatch.setattr(
            "lightrag.parser.docx.parse_document.extract_docx_blocks",
            _stub_extract,
        )

        rag = _MiniRag(tmp_path / "work")
        result = await LightRAG.parse_native(
            rag,
            "doc-pic",
            str(source_path),
            {"parse_format": FULL_DOCS_FORMAT_PENDING_PARSE, "content": ""},
        )

        blocks_path = Path(result["blocks_path"])
        parsed_dir = blocks_path.parent
        asset_dir = parsed_dir / "with_pics.blocks.assets"
        # Asset dir must exist alongside .blocks.jsonl and survive the
        # adapter's parsed_dir cleanup step.
        assert asset_dir.is_dir(), (
            f"asset dir not created at {asset_dir}; parsed_dir contents: "
            f"{list(parsed_dir.iterdir())}"
        )
        # The bytes the stub wrote must come through untouched.
        assert (asset_dir / "pic.png").read_bytes() == b"PNG-BYTES"
        # And drawings.json sidecar should also be there since the block
        # contained a <drawing .../> markup the adapter had to record.
        assert (parsed_dir / "with_pics.drawings.json").is_file()

    asyncio.run(_run())