wxcz_admin
/
lightrag-cn-git


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
0170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103
							"""Tests for :class:`DoclingIRBuilder`.

Each test constructs a minimal inline DoclingDocument dict — the smallest
JSON that exercises one mapping rule from
``docs/DoclingSidecarRefactorPlan-zh.md`` §5. The point is to lock down
contracts that the integration test (running against the live fixture)
cannot inspect cleanly, not to faithfully replicate the docling-serve
output schema.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import pytest

from lightrag.parser.external.docling.ir_builder import DoclingIRBuilder


# ---------------------------------------------------------------------------
# Helpers to build inline fixtures
# ---------------------------------------------------------------------------


def _write_doc(tmp_path: Path, payload: dict, *, stem: str = "demo") -> Path:
    """Dump ``payload`` as JSON into a freshly created raw directory.

    A directory named ``<stem>.docling_raw`` is created directly under
    ``tmp_path`` and ``payload`` is written to ``<stem>.json`` inside it
    (UTF-8).

    Returns:
        The path of the created ``<stem>.docling_raw`` directory.
    """
    workdir = tmp_path.joinpath(f"{stem}.docling_raw")
    workdir.mkdir()
    serialized = json.dumps(payload)
    workdir.joinpath(f"{stem}.json").write_text(serialized, encoding="utf-8")
    return workdir


def _doc(
    *,
    body_children: list[str],
    texts: list[dict] | None = None,
    tables: list[dict] | None = None,
    pictures: list[dict] | None = None,
    groups: list[dict] | None = None,
    key_value_items: list[dict] | None = None,
    form_items: list[dict] | None = None,
) -> dict:
    """Assemble a minimal DoclingDocument-shaped dict for a test.

    Args:
        body_children: Reference strings; each becomes a ``{"$ref": ...}``
            entry of ``body.children``, in the given order.
        texts, tables, pictures, groups, key_value_items, form_items:
            Item lists stored under the same-named top-level key. A missing
            or empty value is stored as ``[]``.

    Returns:
        The document dict with fixed ``schema_name``, ``version`` and
        ``origin`` values.
    """
    body = {
        "self_ref": "#/body",
        "children": [{"$ref": ref} for ref in body_children],
        "content_layer": "body",
        "label": "unspecified",
    }
    document: dict = {
        "schema_name": "DoclingDocument",
        "version": "1.10.0",
        "origin": {"filename": "demo.pdf", "mimetype": "application/pdf"},
        "body": body,
    }
    # Insertion order below fixes the key order of the resulting dict.
    item_lists = (
        ("groups", groups),
        ("texts", texts),
        ("pictures", pictures),
        ("tables", tables),
        ("key_value_items", key_value_items),
        ("form_items", form_items),
    )
    for key, items in item_lists:
        document[key] = items if items else []
    return document


def _text_item(
    *,
    label: str,
    text: str,
    self_ref: str,
    level: int | None = None,
    orig: str | None = None,
    page_no: int = 1,
    bbox: tuple[float, float, float, float] = (10.0, 100.0, 200.0, 80.0),
    coord_origin: str = "BOTTOMLEFT",
    content_layer: str = "body",
    marker: str | None = None,
) -> dict:
    """Build one entry for the ``texts`` list of an inline test document.

    Args:
        label: Stored under ``"label"``.
        text: Stored under ``"text"``; its length closes the ``charspan``.
        self_ref: Stored under ``"self_ref"``.
        level: Added as ``"level"`` only when not ``None``.
        orig: Stored under ``"orig"``; falls back to ``text`` when ``None``.
        page_no: Page number of the single provenance entry.
        bbox: Values mapped, in order, to the ``l``/``t``/``r``/``b`` keys.
        coord_origin: Stored inside the bbox dict as ``"coord_origin"``.
        content_layer: Stored under ``"content_layer"``.
        marker: Added as ``"marker"`` only when not ``None``.

    Returns:
        The item dict carrying exactly one ``prov`` entry.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    provenance = {
        "page_no": page_no,
        "bbox": {
            "l": left,
            "t": top,
            "r": right,
            "b": bottom,
            "coord_origin": coord_origin,
        },
        "charspan": [0, len(text)],
    }
    entry: dict[str, Any] = {
        "self_ref": self_ref,
        "label": label,
        "text": text,
        "orig": text if orig is None else orig,
        "content_layer": content_layer,
        "prov": [provenance],
    }
    # Optional keys are emitted only when the caller supplied a value.
    optional = {"level": level, "marker": marker}
    entry.update({key: value for key, value in optional.items() if value is not None})
    return entry


@pytest.fixture(autouse=True)
def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Unset the Docling-related environment variables before each test.

    Declared ``autouse`` so it applies without being requested; variables
    that are not set are ignored (``raising=False``).
    """
    monkeypatch.delenv("DOCLING_BBOX_ATTRIBUTES", raising=False)
    monkeypatch.delenv("DOCLING_ENGINE_VERSION", raising=False)


# ---------------------------------------------------------------------------
# 1. Heading hierarchy
# ---------------------------------------------------------------------------


def test_docling_adapter_simple_heading_hierarchy(tmp_path: Path) -> None:
    """A title and two nested section headers, each with its own paragraph.

    Every heading is followed by body text, so no adjacency-merge folding is
    expected: the IR should hold one block per heading, and each block's
    ``parent_headings`` lists its ancestors outermost-first.
    """
    # (label, text, docling level) in reading order; ``None`` means "no level".
    spec = [
        ("title", "Whole Doc Title", None),
        ("text", "Title-level body.", None),
        ("section_header", "Background", 1),
        ("text", "Some intro body.", None),
        ("section_header", "Details", 2),
        ("text", "Detail content.", None),
    ]
    refs = [
        "#/texts/0",
        "#/texts/1",
        "#/texts/2",
        "#/texts/3",
        "#/texts/4",
        "#/texts/5",
    ]
    items = [
        _text_item(label=label, text=text, level=level, self_ref=ref)
        for (label, text, level), ref in zip(spec, refs)
    ]
    workdir = _write_doc(tmp_path, _doc(body_children=refs, texts=items))
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    assert ir.doc_title == "Whole Doc Title"
    # The title sits at IR level 1; section_header level N lands at IR level N + 1.
    expected = [
        ("Whole Doc Title", 1, []),
        ("Background", 2, ["Whole Doc Title"]),
        ("Details", 3, ["Whole Doc Title", "Background"]),
    ]
    observed = [(blk.heading, blk.level, blk.parent_headings) for blk in ir.blocks]
    assert observed == expected
    # Each block opens with its heading rendered as a markdown heading line.
    opening_lines = [
        ir.blocks[idx].content_template.splitlines()[0] for idx in range(3)
    ]
    assert opening_lines == ["# Whole Doc Title", "## Background", "### Details"]


def test_docling_adapter_adjacency_merge_folds_empty_heading(tmp_path: Path) -> None:
    """A body-less heading absorbs the deeper heading that follows it.

    The title here has no text of its own, so the next (deeper) heading is
    expected to be folded into the title's block as a body line (matches
    MinerU §5.1.4).
    """
    title = _text_item(label="title", text="Whole Doc Title", self_ref="#/texts/0")
    background = _text_item(
        label="section_header", text="Background", level=1, self_ref="#/texts/1"
    )
    paragraph = _text_item(
        label="text", text="Body for Background.", self_ref="#/texts/2"
    )
    items = [title, background, paragraph]
    payload = _doc(body_children=[item["self_ref"] for item in items], texts=items)
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    # Only the title block survives; "Background" becomes a `## ` line inside it.
    assert len(ir.blocks) == 1
    merged = ir.blocks[0]
    assert merged.heading == "Whole Doc Title"
    assert merged.level == 1
    rendered = merged.content_template.splitlines()
    assert rendered[0] == "# Whole Doc Title"
    for expected_line in ("## Background", "Body for Background."):
        assert expected_line in rendered


def test_docling_adapter_preserves_docling_heading_level(tmp_path: Path) -> None:
    """Heading levels come from Docling, not from the numbering in the text.

    All three section headers are reported at level=1 even though their
    titles look like "1", "2.1" and "2.4.5"; the adapter must keep them on
    one level instead of inferring depth from the numbering.
    """
    numbered_headings = [
        ("#/texts/0", "1 Purpose"),
        ("#/texts/1", "2.1 Electrical"),
        ("#/texts/2", "2.4.5 Temperature"),
    ]
    items = [
        _text_item(label="section_header", text=title, level=1, self_ref=ref)
        for ref, title in numbered_headings
    ]
    payload = _doc(
        body_children=[ref for ref, _ in numbered_headings],
        texts=items,
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    # Every level is shifted by +1 and otherwise left untouched.
    assert [blk.level for blk in ir.blocks] == [2, 2, 2]


# ---------------------------------------------------------------------------
# 2. Multimodal payloads under one heading
# ---------------------------------------------------------------------------


def test_docling_adapter_merges_payloads_under_heading(tmp_path: Path) -> None:
    """Text, a table and a picture under one heading end up in a single block."""
    section = _text_item(
        label="section_header", text="Section", level=1, self_ref="#/texts/0"
    )
    paragraph = _text_item(label="text", text="Inline body line.", self_ref="#/texts/1")
    table = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "data": {
            "num_rows": 1,
            "num_cols": 2,
            "grid": [[{"text": "A"}, {"text": "B"}]],
        },
        "prov": [],
    }
    picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {"uri": "artifacts/foo.png", "mimetype": "image/png"},
        "prov": [],
    }
    reading_order = [section, paragraph, table, picture]
    payload = _doc(
        body_children=[node["self_ref"] for node in reading_order],
        texts=[section, paragraph],
        tables=[table],
        pictures=[picture],
    )
    workdir = _write_doc(tmp_path, payload)
    # Create the file that the picture's image URI points at.
    artifacts = workdir / "artifacts"
    artifacts.mkdir()
    (artifacts / "foo.png").write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")
    assert len(ir.blocks) == 1
    merged = ir.blocks[0]
    rendered = merged.content_template
    table_marker = "{{TBL:tb1}}"
    image_marker = "{{IMG:im2}}"
    # Both placeholders are present, table first (source order).
    assert table_marker in rendered
    assert image_marker in rendered
    assert rendered.index(table_marker) < rendered.index(image_marker)
    assert len(merged.tables) == 1
    assert merged.tables[0].rows == [["A", "B"]]
    assert len(merged.drawings) == 1
    assert merged.drawings[0].asset_ref == "artifacts/foo.png"
    assert merged.drawings[0].fmt == "png"
    assert any(asset.ref == "artifacts/foo.png" for asset in ir.assets)


def test_docling_adapter_visits_text_children_for_modalities(
    tmp_path: Path,
) -> None:
    """Items reachable only through a text item's ``children`` are still emitted.

    The body references just the section header; the paragraph, table,
    picture and formula hang off the header's ``children`` list.
    """
    header = _text_item(
        label="section_header",
        text="Section",
        level=1,
        self_ref="#/texts/0",
    )
    paragraph = _text_item(label="text", text="Child paragraph.", self_ref="#/texts/1")
    formula = _text_item(
        label="formula",
        text="E = mc^2",
        orig="E = mc^2",
        self_ref="#/texts/2",
    )
    header["children"] = [
        {"$ref": "#/texts/1"},
        {"$ref": "#/tables/0"},
        {"$ref": "#/pictures/0"},
        {"$ref": "#/texts/2"},
    ]
    table = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "data": {
            "num_rows": 1,
            "num_cols": 2,
            "grid": [[{"text": "A"}, {"text": "B"}]],
        },
        "prov": [],
    }
    picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {"uri": "artifacts/foo.png", "mimetype": "image/png"},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/texts/0"],
        texts=[header, paragraph, formula],
        tables=[table],
        pictures=[picture],
    )
    workdir = _write_doc(tmp_path, payload)
    # Create the file that the picture's image URI points at.
    artifacts = workdir / "artifacts"
    artifacts.mkdir()
    (artifacts / "foo.png").write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")
    assert len(ir.blocks) == 1
    only_block = ir.blocks[0]
    expected_fragments = (
        "Child paragraph.",
        "{{TBL:tb1}}",
        "{{IMG:im2}}",
        "{{EQ:eq3}}",
    )
    for fragment in expected_fragments:
        assert fragment in only_block.content_template
    assert len(only_block.tables) == 1
    assert len(only_block.drawings) == 1
    assert len(only_block.equations) == 1
    assert only_block.equations[0].is_block is True


# ---------------------------------------------------------------------------
# 3. Inline groups
# ---------------------------------------------------------------------------


def test_docling_adapter_inline_group_joins_children(tmp_path: Path) -> None:
    """Text children of an ``inline`` group are rendered joined by a space."""
    heading = _text_item(label="section_header", text="S", level=1, self_ref="#/texts/0")
    first_word = _text_item(label="text", text="hello", self_ref="#/texts/1")
    second_word = _text_item(label="text", text="world", self_ref="#/texts/2")
    inline_group = {
        "self_ref": "#/groups/0",
        "label": "inline",
        "content_layer": "body",
        "children": [{"$ref": "#/texts/1"}, {"$ref": "#/texts/2"}],
    }
    payload = _doc(
        body_children=["#/texts/0", "#/groups/0"],
        texts=[heading, first_word, second_word],
        groups=[inline_group],
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")
    rendered = ir.blocks[0].content_template
    assert "hello world" in rendered


def test_docling_adapter_inline_group_emits_inline_formula(
    tmp_path: Path,
) -> None:
    """A formula inside an ``inline`` group becomes an inline equation.

    The placeholder is expected between its neighbouring words, and the
    equation is recorded with ``is_block`` false.
    """
    heading = _text_item(label="section_header", text="S", level=1, self_ref="#/texts/0")
    before = _text_item(label="text", text="alpha", self_ref="#/texts/1")
    formula = _text_item(
        label="formula",
        text="x_i",
        orig="x_i",
        self_ref="#/texts/2",
    )
    after = _text_item(label="text", text="omega", self_ref="#/texts/3")
    inline_members = [before, formula, after]
    inline_group = {
        "self_ref": "#/groups/0",
        "label": "inline",
        "content_layer": "body",
        "children": [{"$ref": member["self_ref"]} for member in inline_members],
    }
    payload = _doc(
        body_children=["#/texts/0", "#/groups/0"],
        texts=[heading, *inline_members],
        groups=[inline_group],
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    first_block = ir.blocks[0]
    assert "alpha {{EQI:eq1}} omega" in first_block.content_template
    block_flags = [equation.is_block for equation in first_block.equations]
    assert block_flags == [False]
    assert first_block.equations[0].latex == "x_i"


# ---------------------------------------------------------------------------
# 4. Tables — grid & header
# ---------------------------------------------------------------------------


def test_docling_adapter_table_grid_and_header(tmp_path: Path) -> None:
    """A 2x2 grid with a header row, a caption ref and a footnote ref.

    Checks the rows, dimensions, caption, footnotes, header rows and
    ``self_ref`` exposed on the resulting IR table.
    """
    header_row = [
        {
            "text": "h1",
            "column_header": True,
            "start_row_offset_idx": 0,
        },
        {
            "text": "h2",
            "column_header": True,
            "start_row_offset_idx": 0,
        },
    ]
    data_row = [{"text": "a"}, {"text": "b"}]
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "captions": [{"$ref": "#/texts/0"}],
        "footnotes": [{"$ref": "#/texts/1"}],
        "data": {
            "num_rows": 2,
            "num_cols": 2,
            "grid": [header_row, data_row],
        },
        "prov": [],
    }
    caption = _text_item(label="caption", text="Table caption", self_ref="#/texts/0")
    footnote = _text_item(label="footnote", text="Note: x", self_ref="#/texts/1")
    payload = _doc(
        body_children=["#/tables/0"],
        texts=[caption, footnote],
        tables=[table_item],
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    assert len(ir.blocks) == 1
    ir_table = ir.blocks[0].tables[0]
    assert ir_table.rows == [["h1", "h2"], ["a", "b"]]
    assert ir_table.num_rows == 2
    assert ir_table.num_cols == 2
    assert ir_table.caption == "Table caption"
    assert ir_table.footnotes == ["Note: x"]
    assert ir_table.table_header == [["h1", "h2"]]
    assert ir_table.self_ref == "#/tables/0"


def test_docling_adapter_empty_table_dropped(tmp_path: Path) -> None:
    """Tables without any usable body must be kept out of the IR.

    Docling never populates ``IRTable.html``, so a body-less table would
    reach the sidecar as ``content=""`` and trip the analyze worker's
    "missing table content" path. Mirrors the MinerU-side filter in
    lightrag/parser/external/mineru/ir_builder.py.
    """
    # Shape 1: no ``data`` key at all.
    without_data = {"self_ref": "#/tables/0", "label": "table", "content_layer": "body"}
    # Shape 2: ``data`` present but the grid is empty.
    with_empty_grid = {
        "self_ref": "#/tables/1",
        "label": "table",
        "content_layer": "body",
        "data": {"num_rows": 0, "num_cols": 0, "grid": []},
    }
    # Shape 3: the grid has cells, but their text is empty or whitespace.
    with_blank_cells = {
        "self_ref": "#/tables/2",
        "label": "table",
        "content_layer": "body",
        "data": {
            "num_rows": 1,
            "num_cols": 2,
            "grid": [[{"text": ""}, {"text": "   "}]],
        },
    }
    # Shape 4: no grid; the ``table_cells`` fallback only yields a blank cell.
    with_blank_table_cells = {
        "self_ref": "#/tables/3",
        "label": "table",
        "content_layer": "body",
        "data": {
            "num_rows": 1,
            "num_cols": 1,
            "table_cells": [
                {
                    "text": "",
                    "start_row_offset_idx": 0,
                    "end_row_offset_idx": 1,
                    "start_col_offset_idx": 0,
                    "end_col_offset_idx": 1,
                }
            ],
        },
    }
    hollow_tables = [
        without_data,
        with_empty_grid,
        with_blank_cells,
        with_blank_table_cells,
    ]
    survivor = _text_item(label="text", text="kept", self_ref="#/texts/0")
    payload = _doc(
        body_children=[tbl["self_ref"] for tbl in hollow_tables] + ["#/texts/0"],
        texts=[survivor],
        tables=hollow_tables,
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    # No table objects anywhere, no table placeholder, and the text survives.
    total_tables = sum(len(blk.tables) for blk in ir.blocks)
    assert total_tables == 0
    all_templates = "\n".join(blk.content_template for blk in ir.blocks)
    assert "TBL:" not in all_templates
    assert "kept" in all_templates


def test_docling_adapter_table_extras_is_empty(tmp_path: Path) -> None:
    """The docling adapter deliberately leaves ``IRTable.extras`` empty.

    The historical ``parent`` / ``children_refs`` / ``references`` /
    ``annotations`` / ``cells`` fields were never consumed downstream and
    bloated ``tables.json`` by ~50%. The table below *would* have populated
    all five legacy fields, yet ``extras`` must still equal ``{}``.
    """
    single_cell = {
        "text": "x",
        "row_span": 1,
        "col_span": 1,
        "start_row_offset_idx": 0,
        "end_row_offset_idx": 1,
        "start_col_offset_idx": 0,
        "end_col_offset_idx": 1,
        "bbox": {"l": 1, "t": 2, "r": 3, "b": 4},
    }
    rich_table = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "parent": {"$ref": "#/body"},
        "children": [{"$ref": "#/texts/0"}],
        "references": [{"foo": "bar"}],
        "annotations": [{"note": "x"}],
        "data": {
            "num_rows": 1,
            "num_cols": 1,
            "grid": [[{"text": "x"}]],
            "table_cells": [single_cell],
        },
        "prov": [],
    }
    caption = _text_item(label="caption", text="c", self_ref="#/texts/0")
    payload = _doc(body_children=["#/tables/0"], texts=[caption], tables=[rich_table])
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")
    ir_table = ir.blocks[0].tables[0]
    assert ir_table.extras == {}


# ---------------------------------------------------------------------------
# 5. Picture — referenced asset
# ---------------------------------------------------------------------------


def test_docling_adapter_picture_referenced_asset(tmp_path: Path) -> None:
    """A picture whose image URI names a file in the workdir.

    Checks the drawing's ``asset_ref`` / ``fmt`` / ``self_ref``, the matching
    entry in ``ir.assets`` (source path and suggested name), and the
    intrinsic size recorded in the drawing's ``extras``.
    """
    image_info = {
        "uri": "artifacts/image_000000_abc.png",
        "mimetype": "image/png",
        "size": {"width": 100.0, "height": 200.0},
    }
    picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": image_info,
        "prov": [],
    }
    workdir = _write_doc(
        tmp_path,
        _doc(body_children=["#/pictures/0"], pictures=[picture]),
    )
    artifacts_dir = workdir / "artifacts"
    artifacts_dir.mkdir()
    image_file = artifacts_dir / "image_000000_abc.png"
    image_file.write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")
    drawing = ir.blocks[0].drawings[0]
    assert drawing.asset_ref == "artifacts/image_000000_abc.png"
    assert drawing.fmt == "png"
    assert drawing.self_ref == "#/pictures/0"
    # Exactly one registered asset may carry the drawing's ref.
    matching_assets = [entry for entry in ir.assets if entry.ref == drawing.asset_ref]
    (registered,) = matching_assets
    assert registered.source == image_file
    assert registered.suggested_name == "image_000000_abc.png"
    # intrinsic_size lands in extras for downstream VLM filtering
    assert drawing.extras["intrinsic_size"] == [100.0, 200.0]


# ---------------------------------------------------------------------------
# 6. Positions & bbox_attributes
# ---------------------------------------------------------------------------


def test_docling_adapter_positions_and_bbox_attributes(tmp_path: Path) -> None:
    """Two text items on different pages with different coordinate origins.

    The document-level ``bbox_attributes`` must contain only ``origin``
    (``LEFTBOTTOM``); the ``TOPLEFT`` item is expected to carry a
    per-position ``LEFTTOP`` override while the ``BOTTOMLEFT`` item has none.
    """
    bottom_left_item = _text_item(
        label="text",
        text="A",
        self_ref="#/texts/0",
        page_no=1,
        bbox=(10.0, 100.0, 200.0, 80.0),
        coord_origin="BOTTOMLEFT",
    )
    top_left_item = _text_item(
        label="text",
        text="B",
        self_ref="#/texts/1",
        page_no=2,
        bbox=(20.0, 50.0, 220.0, 30.0),
        coord_origin="TOPLEFT",
    )
    items = [bottom_left_item, top_left_item]
    payload = _doc(body_children=[item["self_ref"] for item in items], texts=items)
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    assert ir.bbox_attributes == {"origin": "LEFTBOTTOM"}
    # no max / page_sizes leaks
    assert set(ir.bbox_attributes.keys()) == {"origin"}

    with_range = [pos for pos in ir.blocks[0].positions if pos.range]
    assert len(with_range) == 2
    # Positions are looked up by page anchor ("1" and "2").
    page_one = next(pos for pos in with_range if pos.anchor == "1")
    page_two = next(pos for pos in with_range if pos.anchor == "2")
    assert page_one.range == [10.0, 100.0, 200.0, 80.0]
    assert page_one.origin is None  # inherits doc-level LEFTBOTTOM
    assert page_two.origin == "LEFTTOP"  # per-position override
    assert page_two.range == [20.0, 50.0, 220.0, 30.0]


def test_docling_adapter_bbox_attributes_env_override(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``DOCLING_BBOX_ATTRIBUTES`` overrides the reported bbox attributes.

    The only text item uses ``BOTTOMLEFT`` coordinates while the environment
    variable declares ``LEFTTOP``; the IR is expected to report the latter.
    """
    monkeypatch.setenv("DOCLING_BBOX_ATTRIBUTES", '{"origin":"LEFTTOP"}')
    lone_item = _text_item(
        label="text",
        text="A",
        self_ref="#/texts/0",
        coord_origin="BOTTOMLEFT",
    )
    payload = _doc(body_children=["#/texts/0"], texts=[lone_item])
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")
    assert ir.bbox_attributes == {"origin": "LEFTTOP"}


# ---------------------------------------------------------------------------
# 7. caption / footnote refs (positive + sibling-not-consumed)
# ---------------------------------------------------------------------------


def test_docling_adapter_caption_refs_only(tmp_path: Path) -> None:
    """Only captions explicitly referenced by the table are consumed.

    The caption listed in ``tables[0].captions`` is kept in
    ``IRTable.caption`` and removed from the reading flow. Text that is NOT
    referenced — even when it is labelled as a caption — stays in the
    reading flow.
    """
    linked_caption = _text_item(
        label="caption", text="Tab1 caption", self_ref="#/texts/0"
    )
    sibling = _text_item(label="text", text="Tab1 sibling", self_ref="#/texts/1")
    orphan_caption = _text_item(
        label="caption", text="Orphan caption", self_ref="#/texts/2"
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "captions": [{"$ref": "#/texts/0"}],
        "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/tables/0", "#/texts/1", "#/texts/2"],
        texts=[linked_caption, sibling, orphan_caption],
        tables=[table_item],
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    first_block = ir.blocks[0]
    assert first_block.tables[0].caption == "Tab1 caption"
    # consumed caption ref does not leak into body text
    assert "Tab1 caption" not in first_block.content_template
    # orphan caption and sibling text DO appear in body
    for kept_text in ("Tab1 sibling", "Orphan caption"):
        assert kept_text in first_block.content_template


def test_docling_adapter_footnotes_refs_only(tmp_path: Path) -> None:
    """Only footnotes explicitly referenced by the table are consumed.

    The referenced footnote moves into ``IRTable.footnotes`` and out of the
    body text; the unreferenced note-like sibling text stays in the body.
    """
    linked_footnote = _text_item(
        label="footnote", text="Linked footnote", self_ref="#/texts/0"
    )
    sibling_note = _text_item(
        label="text", text="注: this is sibling note", self_ref="#/texts/1"
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "footnotes": [{"$ref": "#/texts/0"}],
        "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/tables/0", "#/texts/1"],
        texts=[linked_footnote, sibling_note],
        tables=[table_item],
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    first_block = ir.blocks[0]
    assert first_block.tables[0].footnotes == ["Linked footnote"]
    assert "Linked footnote" not in first_block.content_template
    assert "注: this is sibling note" in first_block.content_template


def test_docling_adapter_table_refs_skip_non_body_caption_footnote(
    tmp_path: Path,
) -> None:
    # Scenario: a body table points at a caption and a footnote that both
    # live in content_layer="furniture" — typically a page header/footer
    # that docling mislabeled and linked to the table. Per the adapter
    # contract, furniture text must never leak into sidecar metadata, so the
    # IRTable's caption/footnotes must come back empty, and the body reading
    # flow must not pick up the furniture text either.
    furniture_caption = _text_item(
        label="caption",
        text="Page header masquerading as caption",
        self_ref="#/texts/0",
        content_layer="furniture",
    )
    furniture_footnote = _text_item(
        label="footnote",
        text="Page footer masquerading as footnote",
        self_ref="#/texts/1",
        content_layer="furniture",
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "captions": [{"$ref": "#/texts/0"}],
        "footnotes": [{"$ref": "#/texts/1"}],
        "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/tables/0"],
        texts=[furniture_caption, furniture_footnote],
        tables=[table_item],
    )
    workdir = _write_doc(tmp_path, payload)
    ir = DoclingIRBuilder().normalize_from_workdir(workdir, document_name="demo.pdf")

    first_block = ir.blocks[0]
    ir_table = first_block.tables[0]
    assert ir_table.caption == ""
    assert ir_table.footnotes == []
    for leaked in ("Page header masquerading", "Page footer masquerading"):
        assert leaked not in first_block.content_template


def test_docling_adapter_picture_children_fallback_skips_non_body(
    tmp_path: Path,
) -> None:
    # Children-fallback variant of the furniture invariant: the body picture
    # declares no explicit captions/footnotes, yet its ``children`` list
    # points at a caption-labelled item in the furniture layer.
    # ``_resolve_children_with_label`` is expected to ignore that child, so
    # the drawing's caption stays empty and the furniture text does not
    # show up in the block content.
    furniture_caption = _text_item(
        label="caption",
        text="Furniture caption via children",
        self_ref="#/texts/0",
        content_layer="furniture",
    )
    body_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {
            "uri": "artifacts/p0.png",
            "mimetype": "image/png",
        },
        "children": [{"$ref": "#/texts/0"}],
        "prov": [],
    }
    document = _doc(
        body_children=["#/pictures/0"],
        texts=[furniture_caption],
        pictures=[body_picture],
    )
    workdir = _write_doc(tmp_path, document)
    # Materialise the image file that the picture's URI points at.
    artifacts_dir = workdir / "artifacts"
    artifacts_dir.mkdir()
    (artifacts_dir / "p0.png").write_bytes(b"\x89PNG fake")
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    first_block = result.blocks[0]
    assert first_block.drawings[0].caption == ""
    assert "Furniture caption via children" not in first_block.content_template


# ---------------------------------------------------------------------------
# 8. furniture skipped
# ---------------------------------------------------------------------------


def test_docling_adapter_furniture_skipped_by_content_layer(tmp_path: Path) -> None:
    # Three body children are listed, the last of which is a page footer in
    # the furniture layer; its text must be absent from every block.
    heading = _text_item(
        label="section_header",
        text="H",
        level=1,
        self_ref="#/texts/0",
    )
    sentence = _text_item(label="text", text="Body sentence.", self_ref="#/texts/1")
    footer = _text_item(
        label="page_footer",
        text="footer 1/5",
        self_ref="#/texts/2",
        content_layer="furniture",
    )
    document = _doc(
        body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
        texts=[heading, sentence, footer],
    )
    workdir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    joined = "\n".join(blk.content_template for blk in result.blocks)
    assert "footer 1/5" not in joined
    # The furniture item's prov page_no=1 must not leak into block positions.
    # The check only applies to a position anchored at "1" that has no
    # range: in that case some position of the same block must have a range.
    for blk in result.blocks:
        for position in blk.positions:
            if position.anchor != "1" or position.range is not None:
                continue
            assert any(other.range is not None for other in blk.positions)


# ---------------------------------------------------------------------------
# 9. Picture inner children dropped from reading flow
# ---------------------------------------------------------------------------


def test_docling_adapter_picture_children_dropped(tmp_path: Path) -> None:
    # A picture owns four children: one caption plus three plain text items
    # (the last one empty). The caption is picked up for the drawing, while
    # the text children stay out of the body and are recorded under the
    # drawing's ``extras`` instead.
    children = (
        ("#/texts/0", "caption", "Picture caption"),
        ("#/texts/1", "text", "Inner OCR text 1"),
        ("#/texts/2", "text", "Inner OCR text 2"),
        ("#/texts/3", "text", ""),
    )
    text_items = [
        _text_item(label=label, text=text, self_ref=ref)
        for ref, label, text in children
    ]
    picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {"uri": "artifacts/img.png", "mimetype": "image/png"},
        "children": [{"$ref": ref} for ref, _, _ in children],
        "prov": [],
    }
    document = _doc(
        body_children=["#/pictures/0"],
        texts=text_items,
        pictures=[picture],
    )
    workdir = _write_doc(tmp_path, document)
    artifacts_dir = workdir / "artifacts"
    artifacts_dir.mkdir()
    (artifacts_dir / "img.png").write_bytes(b"png")
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    first_block = result.blocks[0]
    picture_ir = first_block.drawings[0]
    # The caption-labelled child is adopted through the children fallback
    # and is not counted among the OCR texts.
    assert picture_ir.caption == "Picture caption"
    assert "Picture caption" not in picture_ir.extras.get("ocr_texts", "")
    # Text-only children are kept out of the body content.
    for inner_text in ("Inner OCR text 1", "Inner OCR text 2"):
        assert inner_text not in first_block.content_template
    # extras holds the non-empty OCR paragraphs (the empty child is not
    # counted), rather than raw child refs.
    assert picture_ir.extras["ocr_texts"] == "Inner OCR text 1\n\nInner OCR text 2"
    assert picture_ir.extras["ocr_texts_count"] == 2


# ---------------------------------------------------------------------------
# 10. Picture with missing image or unsafe image URI is skipped
# ---------------------------------------------------------------------------


def test_docling_adapter_picture_missing_image_skipped(tmp_path: Path) -> None:
    # A body picture whose ``image`` entry is None is the only body child;
    # normalization is expected to yield no blocks and no assets.
    imageless_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": None,
        "prov": [],
    }
    document = _doc(body_children=["#/pictures/0"], pictures=[imageless_picture])
    workdir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    assert result.blocks == []
    assert result.assets == []


def test_docling_adapter_picture_rejects_traversal_uri(tmp_path: Path) -> None:
    # Security regression: a poisoned bundle JSON uses ".." in the image URI
    # to point outside raw_dir. The outside file must NOT become an asset;
    # otherwise write_sidecar would copy it into parsed assets, turning a
    # parser-side compromise into arbitrary local-file exfiltration.
    secret_file = tmp_path / "secret.png"
    secret_file.write_bytes(b"\x89PNG outside")
    escaping_image = {
        "uri": "../secret.png",
        "mimetype": "image/png",
    }
    poisoned_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": escaping_image,
        "prov": [],
    }
    document = _doc(body_children=["#/pictures/0"], pictures=[poisoned_picture])
    workdir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    assert result.blocks == []
    assert result.assets == []


def test_docling_adapter_picture_rejects_absolute_uri(tmp_path: Path) -> None:
    # On POSIX, ``Path("raw_dir") / "/etc/passwd"`` drops the raw_dir
    # prefix, so an absolute URI escapes the bundle directory without
    # needing any "..". Such URIs must be rejected as well.
    leaked_file = tmp_path / "leak.png"
    leaked_file.write_bytes(b"\x89PNG outside")
    absolute_image = {
        "uri": str(leaked_file),
        "mimetype": "image/png",
    }
    poisoned_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": absolute_image,
        "prov": [],
    }
    document = _doc(body_children=["#/pictures/0"], pictures=[poisoned_picture])
    workdir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    assert result.blocks == []
    assert result.assets == []


# ---------------------------------------------------------------------------
# 11. Formula
# ---------------------------------------------------------------------------


def test_docling_adapter_formula_text_equals_orig_still_emits_equation(
    tmp_path: Path,
) -> None:
    # A formula item whose ``text`` is identical to its ``orig`` must still
    # produce one block-level equation and an equation placeholder in the
    # content template.
    expression = "C = 2 * P / X"
    formula_item = {
        "self_ref": "#/texts/0",
        "label": "formula",
        "content_layer": "body",
        "text": expression,
        "orig": expression,
        "prov": [],
    }
    document = _doc(body_children=["#/texts/0"], texts=[formula_item])
    workdir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    first_block = result.blocks[0]
    assert len(first_block.equations) == 1
    assert first_block.equations[0].is_block is True
    assert "C = 2 * P / X" in first_block.equations[0].latex
    assert "{{EQ:eq1}}" in first_block.content_template


def test_docling_adapter_formula_with_latex_wraps_dollars(tmp_path: Path) -> None:
    # A formula item with LaTeX in ``text`` (and a differing ``orig``) must
    # come back as a single equation whose latex is delimited by "$$",
    # keeps the source self_ref, and is referenced from the content
    # template through its placeholder.
    formula_item = {
        "self_ref": "#/texts/0",
        "label": "formula",
        "content_layer": "body",
        "text": "C = 2 \\cdot P",
        "orig": "<unreadable>",
        "prov": [],
    }
    document = _doc(body_children=["#/texts/0"], texts=[formula_item])
    workdir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    result = builder.normalize_from_workdir(workdir, document_name="demo.pdf")
    first_block = result.blocks[0]
    assert len(first_block.equations) == 1
    equation = first_block.equations[0]
    assert equation.latex.startswith("$$")
    assert equation.latex.endswith("$$")
    assert "C = 2 \\cdot P" in equation.latex
    assert equation.self_ref == "#/texts/0"
    assert "{{EQ:eq1}}" in first_block.content_template


# ---------------------------------------------------------------------------
# 12. key_value_items / form_items audit
# ---------------------------------------------------------------------------


def test_docling_adapter_kv_form_items_audit_in_split_option(tmp_path: Path) -> None:
    # With two key_value_items and one form_item in the document (and no
    # body children), ``split_option["docling_extras"]`` is expected to
    # report exactly those per-kind counts.
    document = _doc(
        body_children=[],
        key_value_items=[{"id": "kv1"}, {"id": "kv2"}],
        form_items=[{"id": "f1"}],
    )
    raw_dir = _write_doc(tmp_path, document)
    builder = DoclingIRBuilder()
    ir = builder.normalize_from_workdir(raw_dir, document_name="demo.pdf")
    extras = ir.split_option["docling_extras"]
    expected_counts = {"key_value_items": 2, "form_items": 1}
    assert extras == expected_counts