| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103 |
- """Tests for :class:`DoclingIRBuilder`.
- Each test constructs a minimal inline DoclingDocument dict — the smallest
- JSON that exercises one mapping rule from
- ``docs/DoclingSidecarRefactorPlan-zh.md`` §5. The point is to lock down
- contracts that the integration test (running against the live fixture)
- cannot inspect cleanly, not to faithfully replicate the docling-serve
- output schema.
- """
- from __future__ import annotations
- import json
- from pathlib import Path
- from typing import Any
- import pytest
- from lightrag.parser.external.docling.ir_builder import DoclingIRBuilder
- # ---------------------------------------------------------------------------
- # Helpers to build inline fixtures
- # ---------------------------------------------------------------------------
- def _write_doc(tmp_path: Path, payload: dict, *, stem: str = "demo") -> Path:
- raw_dir = tmp_path / f"{stem}.docling_raw"
- raw_dir.mkdir()
- (raw_dir / f"{stem}.json").write_text(json.dumps(payload), encoding="utf-8")
- return raw_dir
- def _doc(
- *,
- body_children: list[str],
- texts: list[dict] | None = None,
- tables: list[dict] | None = None,
- pictures: list[dict] | None = None,
- groups: list[dict] | None = None,
- key_value_items: list[dict] | None = None,
- form_items: list[dict] | None = None,
- ) -> dict:
- return {
- "schema_name": "DoclingDocument",
- "version": "1.10.0",
- "origin": {"filename": "demo.pdf", "mimetype": "application/pdf"},
- "body": {
- "self_ref": "#/body",
- "children": [{"$ref": r} for r in body_children],
- "content_layer": "body",
- "label": "unspecified",
- },
- "groups": groups or [],
- "texts": texts or [],
- "pictures": pictures or [],
- "tables": tables or [],
- "key_value_items": key_value_items or [],
- "form_items": form_items or [],
- }
- def _text_item(
- *,
- label: str,
- text: str,
- self_ref: str,
- level: int | None = None,
- orig: str | None = None,
- page_no: int = 1,
- bbox: tuple[float, float, float, float] = (10.0, 100.0, 200.0, 80.0),
- coord_origin: str = "BOTTOMLEFT",
- content_layer: str = "body",
- marker: str | None = None,
- ) -> dict:
- item: dict[str, Any] = {
- "self_ref": self_ref,
- "label": label,
- "text": text,
- "orig": orig if orig is not None else text,
- "content_layer": content_layer,
- "prov": [
- {
- "page_no": page_no,
- "bbox": {
- "l": bbox[0],
- "t": bbox[1],
- "r": bbox[2],
- "b": bbox[3],
- "coord_origin": coord_origin,
- },
- "charspan": [0, len(text)],
- }
- ],
- }
- if level is not None:
- item["level"] = level
- if marker is not None:
- item["marker"] = marker
- return item
@pytest.fixture(autouse=True)
def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Remove the docling env overrides before every test in this module.

    Autouse, so no test inherits ``DOCLING_BBOX_ATTRIBUTES`` or
    ``DOCLING_ENGINE_VERSION`` from the surrounding environment; a variable
    that is not set is silently ignored.
    """
    monkeypatch.delenv("DOCLING_BBOX_ATTRIBUTES", raising=False)
    monkeypatch.delenv("DOCLING_ENGINE_VERSION", raising=False)
- # ---------------------------------------------------------------------------
- # 1. Heading hierarchy
- # ---------------------------------------------------------------------------
def test_docling_adapter_simple_heading_hierarchy(tmp_path: Path) -> None:
    """A title and two nested sections, each followed by its own body.

    Since every heading carries body text, no adjacency merge applies: the
    IR ends up with one block per heading and a clean parent-heading chain.
    """
    # (label, text, docling level) in reading order; position == texts index.
    outline = [
        ("title", "Whole Doc Title", None),
        ("text", "Title-level body.", None),
        ("section_header", "Background", 1),
        ("text", "Some intro body.", None),
        ("section_header", "Details", 2),
        ("text", "Detail content.", None),
    ]
    texts = [
        _text_item(label=label, text=text, level=level, self_ref=f"#/texts/{idx}")
        for idx, (label, text, level) in enumerate(outline)
    ]
    payload = _doc(
        body_children=[f"#/texts/{idx}" for idx in range(len(outline))],
        texts=texts,
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert ir.doc_title == "Whole Doc Title"

    # The title sits at IR level 1; section_header level N maps to IR level N+1.
    expected_outline = [
        ("Whole Doc Title", 1, []),
        ("Background", 2, ["Whole Doc Title"]),
        ("Details", 3, ["Whole Doc Title", "Background"]),
    ]
    observed_outline = [
        (blk.heading, blk.level, blk.parent_headings) for blk in ir.blocks
    ]
    assert observed_outline == expected_outline

    # The markdown-prefixed heading is the FIRST line of each template.
    expected_first_lines = ("# Whole Doc Title", "## Background", "### Details")
    for blk, first_line in zip(ir.blocks, expected_first_lines):
        assert blk.content_template.splitlines()[0] == first_line
def test_docling_adapter_adjacency_merge_folds_empty_heading(tmp_path: Path) -> None:
    """A body-less heading absorbs the deeper heading that follows it.

    The title has no body of its own, so ``Background`` is folded into the
    title's block as a body line (matches MinerU §5.1.4).
    """
    title = _text_item(label="title", text="Whole Doc Title", self_ref="#/texts/0")
    section = _text_item(
        label="section_header", text="Background", level=1, self_ref="#/texts/1"
    )
    paragraph = _text_item(
        label="text", text="Body for Background.", self_ref="#/texts/2"
    )
    payload = _doc(
        body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
        texts=[title, section, paragraph],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    # Everything collapses into the title's block.
    assert len(ir.blocks) == 1
    merged = ir.blocks[0]
    assert merged.heading == "Whole Doc Title"
    assert merged.level == 1

    rendered = merged.content_template.splitlines()
    assert rendered[0] == "# Whole Doc Title"
    # The folded heading shows up as a `## ` line, followed by its body.
    for expected_line in ("## Background", "Body for Background."):
        assert expected_line in rendered
def test_docling_adapter_preserves_docling_heading_level(tmp_path: Path) -> None:
    """Heading levels come from Docling, not from the heading's numbering.

    All three section headers are reported at level=1 even though their
    numbering suggests deeper nesting; the adapter keeps them on one level.
    """
    numbered_titles = ("1 Purpose", "2.1 Electrical", "2.4.5 Temperature")
    texts = [
        _text_item(
            label="section_header",
            text=title,
            level=1,
            self_ref=f"#/texts/{idx}",
        )
        for idx, title in enumerate(numbered_titles)
    ]
    payload = _doc(
        body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
        texts=texts,
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    # Every heading is bumped by +1 and nothing is re-levelled.
    assert [blk.level for blk in ir.blocks] == [2, 2, 2]
- # ---------------------------------------------------------------------------
- # 2. Multimodal payloads under one heading
- # ---------------------------------------------------------------------------
def test_docling_adapter_merges_payloads_under_heading(tmp_path: Path) -> None:
    """Text, a table and a picture under one heading land in a single block."""
    heading = _text_item(
        label="section_header", text="Section", level=1, self_ref="#/texts/0"
    )
    paragraph = _text_item(
        label="text", text="Inline body line.", self_ref="#/texts/1"
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "data": {
            "num_rows": 1,
            "num_cols": 2,
            "grid": [[{"text": "A"}, {"text": "B"}]],
        },
        "prov": [],
    }
    picture_item = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {"uri": "artifacts/foo.png", "mimetype": "image/png"},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/texts/0", "#/texts/1", "#/tables/0", "#/pictures/0"],
        texts=[heading, paragraph],
        tables=[table_item],
        pictures=[picture_item],
    )
    raw_dir = _write_doc(tmp_path, payload)
    # Provide the file the picture URI points at.
    artifacts_dir = raw_dir / "artifacts"
    artifacts_dir.mkdir()
    (artifacts_dir / "foo.png").write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert len(ir.blocks) == 1
    block = ir.blocks[0]
    template = block.content_template

    # One placeholder per payload, table before image as in the source order.
    table_slot = "{{TBL:tb1}}"
    image_slot = "{{IMG:im2}}"
    assert table_slot in template
    assert image_slot in template
    assert template.index(table_slot) < template.index(image_slot)

    assert len(block.tables) == 1
    assert block.tables[0].rows == [["A", "B"]]

    assert len(block.drawings) == 1
    drawing = block.drawings[0]
    assert drawing.asset_ref == "artifacts/foo.png"
    assert drawing.fmt == "png"
    assert any(asset.ref == "artifacts/foo.png" for asset in ir.assets)
def test_docling_adapter_visits_text_children_for_modalities(
    tmp_path: Path,
) -> None:
    """Payloads listed as ``children`` of a text item are still visited.

    Only the heading is a body child; the paragraph, table, picture and
    formula are reachable solely through the heading's ``children`` list.
    """
    heading = _text_item(
        label="section_header",
        text="Section",
        level=1,
        self_ref="#/texts/0",
    )
    heading["children"] = [
        {"$ref": ref}
        for ref in ("#/texts/1", "#/tables/0", "#/pictures/0", "#/texts/2")
    ]
    paragraph = _text_item(label="text", text="Child paragraph.", self_ref="#/texts/1")
    formula = _text_item(
        label="formula",
        text="E = mc^2",
        orig="E = mc^2",
        self_ref="#/texts/2",
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "data": {
            "num_rows": 1,
            "num_cols": 2,
            "grid": [[{"text": "A"}, {"text": "B"}]],
        },
        "prov": [],
    }
    picture_item = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {"uri": "artifacts/foo.png", "mimetype": "image/png"},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/texts/0"],
        texts=[heading, paragraph, formula],
        tables=[table_item],
        pictures=[picture_item],
    )
    raw_dir = _write_doc(tmp_path, payload)
    artifacts_dir = raw_dir / "artifacts"
    artifacts_dir.mkdir()
    (artifacts_dir / "foo.png").write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert len(ir.blocks) == 1
    block = ir.blocks[0]
    template = block.content_template
    for fragment in (
        "Child paragraph.",
        "{{TBL:tb1}}",
        "{{IMG:im2}}",
        "{{EQ:eq3}}",
    ):
        assert fragment in template

    assert len(block.tables) == 1
    assert len(block.drawings) == 1
    assert len(block.equations) == 1
    assert block.equations[0].is_block is True
- # ---------------------------------------------------------------------------
- # 3. Inline groups
- # ---------------------------------------------------------------------------
def test_docling_adapter_inline_group_joins_children(tmp_path: Path) -> None:
    """Text children of an ``inline`` group are joined onto one line."""
    heading = _text_item(label="section_header", text="S", level=1, self_ref="#/texts/0")
    words = [
        _text_item(label="text", text="hello", self_ref="#/texts/1"),
        _text_item(label="text", text="world", self_ref="#/texts/2"),
    ]
    inline_group = {
        "self_ref": "#/groups/0",
        "label": "inline",
        "content_layer": "body",
        "children": [{"$ref": "#/texts/1"}, {"$ref": "#/texts/2"}],
    }
    payload = _doc(
        body_children=["#/texts/0", "#/groups/0"],
        texts=[heading, *words],
        groups=[inline_group],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    first_template = ir.blocks[0].content_template
    assert "hello world" in first_template
def test_docling_adapter_inline_group_emits_inline_formula(
    tmp_path: Path,
) -> None:
    """A formula inside an ``inline`` group becomes an inline equation."""
    heading = _text_item(label="section_header", text="S", level=1, self_ref="#/texts/0")
    before = _text_item(label="text", text="alpha", self_ref="#/texts/1")
    formula = _text_item(
        label="formula",
        text="x_i",
        orig="x_i",
        self_ref="#/texts/2",
    )
    after = _text_item(label="text", text="omega", self_ref="#/texts/3")
    inline_group = {
        "self_ref": "#/groups/0",
        "label": "inline",
        "content_layer": "body",
        "children": [
            {"$ref": ref} for ref in ("#/texts/1", "#/texts/2", "#/texts/3")
        ],
    }
    payload = _doc(
        body_children=["#/texts/0", "#/groups/0"],
        texts=[heading, before, formula, after],
        groups=[inline_group],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    # The inline placeholder sits between its neighbours on the same line.
    assert "alpha {{EQI:eq1}} omega" in block.content_template
    block_flags = [equation.is_block for equation in block.equations]
    assert block_flags == [False]
    assert block.equations[0].latex == "x_i"
- # ---------------------------------------------------------------------------
- # 4. Tables — grid & header
- # ---------------------------------------------------------------------------
def test_docling_adapter_table_grid_and_header(tmp_path: Path) -> None:
    """Grid rows, header rows, caption and footnotes all map onto the IR table."""
    header_row = [
        {"text": cell_text, "column_header": True, "start_row_offset_idx": 0}
        for cell_text in ("h1", "h2")
    ]
    data_row = [{"text": "a"}, {"text": "b"}]
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "captions": [{"$ref": "#/texts/0"}],
        "footnotes": [{"$ref": "#/texts/1"}],
        "data": {
            "num_rows": 2,
            "num_cols": 2,
            "grid": [header_row, data_row],
        },
        "prov": [],
    }
    caption = _text_item(label="caption", text="Table caption", self_ref="#/texts/0")
    footnote = _text_item(label="footnote", text="Note: x", self_ref="#/texts/1")
    payload = _doc(
        body_children=["#/tables/0"],
        texts=[caption, footnote],
        tables=[table_item],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert len(ir.blocks) == 1
    ir_table = ir.blocks[0].tables[0]
    assert ir_table.rows == [["h1", "h2"], ["a", "b"]]
    assert ir_table.num_rows == 2
    assert ir_table.num_cols == 2
    assert ir_table.caption == "Table caption"
    assert ir_table.footnotes == ["Note: x"]
    assert ir_table.table_header == [["h1", "h2"]]
    assert ir_table.self_ref == "#/tables/0"
def test_docling_adapter_empty_table_dropped(tmp_path: Path) -> None:
    """Tables without any usable body MUST NOT enter the IR.

    Docling never populates ``IRTable.html``, so a body-less table would
    land in the sidecar as ``content=""`` and trip the analyze worker's
    "missing table content" path. Mirrors the MinerU-side filter in
    lightrag/parser/external/mineru/ir_builder.py.
    """

    def _body_table(index: int, **fields: Any) -> dict:
        # Shared envelope; ``fields`` contributes the (possibly absent) data.
        return {
            "self_ref": f"#/tables/{index}",
            "label": "table",
            "content_layer": "body",
            **fields,
        }

    blank_cell = {
        "text": "",
        "start_row_offset_idx": 0,
        "end_row_offset_idx": 1,
        "start_col_offset_idx": 0,
        "end_col_offset_idx": 1,
    }
    # Four shapes of "no visible content" — all must be dropped.
    tables = [
        # 1) ``data`` missing entirely.
        _body_table(0),
        # 2) Empty grid.
        _body_table(1, data={"num_rows": 0, "num_cols": 0, "grid": []}),
        # 3) Grid with only blank cell text.
        _body_table(
            2,
            data={
                "num_rows": 1,
                "num_cols": 2,
                "grid": [[{"text": ""}, {"text": " "}]],
            },
        ),
        # 4) table_cells fallback yields a blank grid.
        _body_table(
            3,
            data={"num_rows": 1, "num_cols": 1, "table_cells": [blank_cell]},
        ),
    ]
    kept_text = _text_item(label="text", text="kept", self_ref="#/texts/0")
    payload = _doc(
        body_children=[
            "#/tables/0",
            "#/tables/1",
            "#/tables/2",
            "#/tables/3",
            "#/texts/0",
        ],
        texts=[kept_text],
        tables=tables,
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    surviving_tables = sum(len(blk.tables) for blk in ir.blocks)
    assert surviving_tables == 0
    all_templates = "\n".join(blk.content_template for blk in ir.blocks)
    assert "TBL:" not in all_templates
    assert "kept" in all_templates
def test_docling_adapter_table_extras_is_empty(tmp_path: Path) -> None:
    """``IRTable.extras`` stays blank for docling tables.

    The historical ``parent`` / ``children_refs`` / ``references`` /
    ``annotations`` / ``cells`` fields were never consumed downstream and
    bloated ``tables.json`` by ~50%. The table below *would* have populated
    all five legacy fields, yet ``extras`` must come back as ``{}``.
    """
    detailed_cell = {
        "text": "x",
        "row_span": 1,
        "col_span": 1,
        "start_row_offset_idx": 0,
        "end_row_offset_idx": 1,
        "start_col_offset_idx": 0,
        "end_col_offset_idx": 1,
        "bbox": {"l": 1, "t": 2, "r": 3, "b": 4},
    }
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "parent": {"$ref": "#/body"},
        "children": [{"$ref": "#/texts/0"}],
        "references": [{"foo": "bar"}],
        "annotations": [{"note": "x"}],
        "data": {
            "num_rows": 1,
            "num_cols": 1,
            "grid": [[{"text": "x"}]],
            "table_cells": [detailed_cell],
        },
        "prov": [],
    }
    caption = _text_item(label="caption", text="c", self_ref="#/texts/0")
    payload = _doc(body_children=["#/tables/0"], texts=[caption], tables=[table_item])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    ir_table = ir.blocks[0].tables[0]
    assert ir_table.extras == {}
- # ---------------------------------------------------------------------------
- # 5. Picture — referenced asset
- # ---------------------------------------------------------------------------
def test_docling_adapter_picture_referenced_asset(tmp_path: Path) -> None:
    """A picture whose URI points at a bundle file yields a drawing plus asset."""
    image_name = "image_000000_abc.png"
    image_uri = f"artifacts/{image_name}"
    picture_item = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {
            "uri": image_uri,
            "mimetype": "image/png",
            "size": {"width": 100.0, "height": 200.0},
        },
        "prov": [],
    }
    payload = _doc(body_children=["#/pictures/0"], pictures=[picture_item])
    raw_dir = _write_doc(tmp_path, payload)
    artifacts_dir = raw_dir / "artifacts"
    artifacts_dir.mkdir()
    image_path = artifacts_dir / image_name
    image_path.write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    drawing = ir.blocks[0].drawings[0]
    assert drawing.asset_ref == "artifacts/image_000000_abc.png"
    assert drawing.fmt == "png"
    assert drawing.self_ref == "#/pictures/0"

    # Exactly one asset entry may carry the drawing's ref.
    matching_assets = [entry for entry in ir.assets if entry.ref == drawing.asset_ref]
    (asset_entry,) = matching_assets
    assert asset_entry.source == image_path
    assert asset_entry.suggested_name == "image_000000_abc.png"

    # intrinsic_size lands in extras for downstream VLM filtering
    assert drawing.extras["intrinsic_size"] == [100.0, 200.0]
- # ---------------------------------------------------------------------------
- # 6. Positions & bbox_attributes
- # ---------------------------------------------------------------------------
def test_docling_adapter_positions_and_bbox_attributes(tmp_path: Path) -> None:
    """Bbox positions keep their raw ranges; only deviating origins are tagged.

    The first item uses BOTTOMLEFT coordinates and the second TOPLEFT. The
    document-level origin is LEFTBOTTOM, so only the second position carries
    an explicit per-position origin.
    """
    bottom_left_item = _text_item(
        label="text",
        text="A",
        self_ref="#/texts/0",
        page_no=1,
        bbox=(10.0, 100.0, 200.0, 80.0),
        coord_origin="BOTTOMLEFT",
    )
    top_left_item = _text_item(
        label="text",
        text="B",
        self_ref="#/texts/1",
        page_no=2,
        bbox=(20.0, 50.0, 220.0, 30.0),
        coord_origin="TOPLEFT",
    )
    payload = _doc(
        body_children=["#/texts/0", "#/texts/1"],
        texts=[bottom_left_item, top_left_item],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert ir.bbox_attributes == {"origin": "LEFTBOTTOM"}
    # no max / page_sizes leaks
    assert set(ir.bbox_attributes.keys()) == {"origin"}

    ranged = [pos for pos in ir.blocks[0].positions if pos.range]
    assert len(ranged) == 2
    page_one = next(pos for pos in ranged if pos.anchor == "1")
    page_two = next(pos for pos in ranged if pos.anchor == "2")

    assert page_one.range == [10.0, 100.0, 200.0, 80.0]
    assert page_one.origin is None  # inherits doc-level LEFTBOTTOM
    assert page_two.origin == "LEFTTOP"  # per-position override
    assert page_two.range == [20.0, 50.0, 220.0, 30.0]
def test_docling_adapter_bbox_attributes_env_override(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``DOCLING_BBOX_ATTRIBUTES`` wins over the origin found in the document."""
    monkeypatch.setenv("DOCLING_BBOX_ATTRIBUTES", '{"origin":"LEFTTOP"}')
    only_item = _text_item(
        label="text",
        text="A",
        self_ref="#/texts/0",
        coord_origin="BOTTOMLEFT",
    )
    payload = _doc(body_children=["#/texts/0"], texts=[only_item])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert ir.bbox_attributes == {"origin": "LEFTTOP"}
- # ---------------------------------------------------------------------------
- # 7. caption / footnote refs (positive + sibling-not-consumed)
- # ---------------------------------------------------------------------------
def test_docling_adapter_caption_refs_only(tmp_path: Path) -> None:
    """Only captions referenced by the table are consumed.

    The caption listed in ``tables[0].captions`` is kept in
    ``IRTable.caption`` and dropped from the reading flow. Sibling text that
    is NOT referenced — even when it is labelled as a caption — stays in the
    reading flow.
    """
    linked_caption = _text_item(
        label="caption", text="Tab1 caption", self_ref="#/texts/0"
    )
    sibling_text = _text_item(label="text", text="Tab1 sibling", self_ref="#/texts/1")
    orphan_caption = _text_item(
        label="caption", text="Orphan caption", self_ref="#/texts/2"
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "captions": [{"$ref": "#/texts/0"}],
        "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/tables/0", "#/texts/1", "#/texts/2"],
        texts=[linked_caption, sibling_text, orphan_caption],
        tables=[table_item],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    template = block.content_template
    assert block.tables[0].caption == "Tab1 caption"
    # consumed caption ref does not leak into body text
    assert "Tab1 caption" not in template
    # orphan caption and sibling text DO appear in body
    assert "Tab1 sibling" in template
    assert "Orphan caption" in template
def test_docling_adapter_footnotes_refs_only(tmp_path: Path) -> None:
    """Only footnotes referenced by the table are consumed.

    The linked footnote moves into ``IRTable.footnotes``; an unreferenced
    note-like sibling paragraph stays in the body text.
    """
    linked_footnote = _text_item(
        label="footnote", text="Linked footnote", self_ref="#/texts/0"
    )
    sibling_note = _text_item(
        label="text", text="注: this is sibling note", self_ref="#/texts/1"
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "footnotes": [{"$ref": "#/texts/0"}],
        "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/tables/0", "#/texts/1"],
        texts=[linked_footnote, sibling_note],
        tables=[table_item],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    template = block.content_template
    assert block.tables[0].footnotes == ["Linked footnote"]
    assert "Linked footnote" not in template
    assert "注: this is sibling note" in template
def test_docling_adapter_table_refs_skip_non_body_caption_footnote(
    tmp_path: Path,
) -> None:
    """Furniture-layer caption/footnote targets are ignored by a body table.

    A body table references a caption/footnote whose targets sit in
    content_layer="furniture" — typically a page header/footer that docling
    mislabeled and linked to the table. The adapter contract is that
    furniture text must never leak into sidecar metadata, so the IRTable's
    caption/footnotes must come back empty (and the body reading flow must
    not pick up the furniture text either).
    """
    furniture_caption = _text_item(
        label="caption",
        text="Page header masquerading as caption",
        self_ref="#/texts/0",
        content_layer="furniture",
    )
    furniture_footnote = _text_item(
        label="footnote",
        text="Page footer masquerading as footnote",
        self_ref="#/texts/1",
        content_layer="furniture",
    )
    table_item = {
        "self_ref": "#/tables/0",
        "label": "table",
        "content_layer": "body",
        "captions": [{"$ref": "#/texts/0"}],
        "footnotes": [{"$ref": "#/texts/1"}],
        "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
        "prov": [],
    }
    payload = _doc(
        body_children=["#/tables/0"],
        texts=[furniture_caption, furniture_footnote],
        tables=[table_item],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    ir_table = block.tables[0]
    assert ir_table.caption == ""
    assert ir_table.footnotes == []
    for furniture_fragment in (
        "Page header masquerading",
        "Page footer masquerading",
    ):
        assert furniture_fragment not in block.content_template
def test_docling_adapter_picture_children_fallback_skips_non_body(
    tmp_path: Path,
) -> None:
    """The children fallback must not surface furniture text as a caption.

    Same invariant as for explicit caption refs: a body picture has no
    explicit captions/footnotes, but its ``children`` list refs a caption
    whose target is furniture. ``_resolve_children_with_label`` must skip it
    rather than silently surfacing furniture text as the picture's caption.
    """
    furniture_caption = _text_item(
        label="caption",
        text="Furniture caption via children",
        self_ref="#/texts/0",
        content_layer="furniture",
    )
    picture_item = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {
            "uri": "artifacts/p0.png",
            "mimetype": "image/png",
        },
        "children": [{"$ref": "#/texts/0"}],
        "prov": [],
    }
    payload = _doc(
        body_children=["#/pictures/0"],
        texts=[furniture_caption],
        pictures=[picture_item],
    )
    raw_dir = _write_doc(tmp_path, payload)
    artifacts_dir = raw_dir / "artifacts"
    artifacts_dir.mkdir()
    (artifacts_dir / "p0.png").write_bytes(b"\x89PNG fake")

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    assert block.drawings[0].caption == ""
    assert "Furniture caption via children" not in block.content_template
- # ---------------------------------------------------------------------------
- # 8. furniture skipped
- # ---------------------------------------------------------------------------
def test_docling_adapter_furniture_skipped_by_content_layer(tmp_path: Path) -> None:
    """Furniture-layer items contribute neither text nor positions.

    The footer sits on a page of its own so a leaked position can be told
    apart from the body items' positions, which all live on page 1.
    """
    # Any page number not used by the body items works here.
    furniture_page = 7
    texts = [
        _text_item(label="section_header", text="H", level=1, self_ref="#/texts/0"),
        _text_item(label="text", text="Body sentence.", self_ref="#/texts/1"),
        _text_item(
            label="page_footer",
            text="footer 1/5",
            self_ref="#/texts/2",
            content_layer="furniture",
            page_no=furniture_page,
        ),
    ]
    raw_dir = _write_doc(
        tmp_path,
        _doc(
            body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
            texts=texts,
        ),
    )

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    full = "\n".join(b.content_template for b in ir.blocks)
    # Guard against a vacuous pass: the body text itself must have survived.
    assert "Body sentence." in full
    assert "footer 1/5" not in full

    # Position anchors carry the page number as a string (the positions test
    # in this module matches page_no=1 against anchor "1"), so no position
    # may be anchored on the furniture-only page.
    leaked = [
        pos
        for block in ir.blocks
        for pos in block.positions
        if pos.anchor == str(furniture_page)
    ]
    assert leaked == []
- # ---------------------------------------------------------------------------
- # 9. Picture inner children dropped from reading flow
- # ---------------------------------------------------------------------------
def test_docling_adapter_picture_children_dropped(tmp_path: Path) -> None:
    """Text nested inside a picture stays out of the reading flow.

    The caption-labelled child becomes the drawing's caption; the remaining
    non-empty children are collected under ``extras["ocr_texts"]``.
    """
    child_specs = [
        ("caption", "Picture caption"),
        ("text", "Inner OCR text 1"),
        ("text", "Inner OCR text 2"),
        ("text", ""),
    ]
    texts = [
        _text_item(label=label, text=text, self_ref=f"#/texts/{idx}")
        for idx, (label, text) in enumerate(child_specs)
    ]
    picture_item = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {"uri": "artifacts/img.png", "mimetype": "image/png"},
        "children": [
            {"$ref": "#/texts/0"},
            {"$ref": "#/texts/1"},
            {"$ref": "#/texts/2"},
            {"$ref": "#/texts/3"},
        ],
        "prov": [],
    }
    payload = _doc(
        body_children=["#/pictures/0"], texts=texts, pictures=[picture_item]
    )
    raw_dir = _write_doc(tmp_path, payload)
    artifacts_dir = raw_dir / "artifacts"
    artifacts_dir.mkdir()
    (artifacts_dir / "img.png").write_bytes(b"png")

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    drawing = block.drawings[0]
    extras = drawing.extras

    # caption (label=caption) is taken via children fallback
    assert drawing.caption == "Picture caption"
    assert "Picture caption" not in extras.get("ocr_texts", "")

    # OCR-only children do NOT appear in body content
    for ocr_line in ("Inner OCR text 1", "Inner OCR text 2"):
        assert ocr_line not in block.content_template

    # extras records non-empty OCR paragraphs, not raw child refs.
    assert extras["ocr_texts"] == "Inner OCR text 1\n\nInner OCR text 2"
    assert extras["ocr_texts_count"] == 2
- # ---------------------------------------------------------------------------
- # 10. Picture with missing image is skipped
- # ---------------------------------------------------------------------------
def test_docling_adapter_picture_missing_image_skipped(tmp_path: Path) -> None:
    """A picture whose ``image`` is null yields no block and no asset."""
    imageless_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": None,
        "prov": [],
    }
    payload = _doc(body_children=["#/pictures/0"], pictures=[imageless_picture])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert ir.blocks == []
    assert ir.assets == []
def test_docling_adapter_picture_rejects_traversal_uri(tmp_path: Path) -> None:
    """An image URI that climbs out of the bundle via ``..`` is rejected.

    A poisoned bundle JSON points the image URI outside raw_dir. The asset
    must NOT pick up the outside file — otherwise write_sidecar would copy
    it into parsed assets, turning a parser-side compromise into arbitrary
    local-file exfiltration.
    """
    # Lives next to the bundle directory, i.e. exactly where "../" leads.
    outside_file = tmp_path / "secret.png"
    outside_file.write_bytes(b"\x89PNG outside")
    poisoned_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {
            "uri": "../secret.png",
            "mimetype": "image/png",
        },
        "prov": [],
    }
    payload = _doc(body_children=["#/pictures/0"], pictures=[poisoned_picture])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert ir.blocks == []
    assert ir.assets == []
def test_docling_adapter_picture_rejects_absolute_uri(tmp_path: Path) -> None:
    """An absolute image URI is rejected as well.

    ``Path("raw_dir") / "/etc/passwd"`` discards raw_dir on POSIX, so an
    absolute URI would escape even without a "..".
    """
    outside_file = tmp_path / "leak.png"
    outside_file.write_bytes(b"\x89PNG outside")
    poisoned_picture = {
        "self_ref": "#/pictures/0",
        "label": "picture",
        "content_layer": "body",
        "image": {
            "uri": str(outside_file),
            "mimetype": "image/png",
        },
        "prov": [],
    }
    payload = _doc(body_children=["#/pictures/0"], pictures=[poisoned_picture])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    assert ir.blocks == []
    assert ir.assets == []
- # ---------------------------------------------------------------------------
- # 11. Formula
- # ---------------------------------------------------------------------------
def test_docling_adapter_formula_text_equals_orig_still_emits_equation(
    tmp_path: Path,
) -> None:
    """A formula whose ``text`` equals its ``orig`` still becomes an equation."""
    expression = "C = 2 * P / X"
    formula_item = {
        "self_ref": "#/texts/0",
        "label": "formula",
        "content_layer": "body",
        "text": expression,
        "orig": expression,
        "prov": [],
    }
    payload = _doc(body_children=["#/texts/0"], texts=[formula_item])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    assert len(block.equations) == 1
    equation = block.equations[0]
    assert equation.is_block is True
    assert "C = 2 * P / X" in equation.latex
    assert "{{EQ:eq1}}" in block.content_template
def test_docling_adapter_formula_with_latex_wraps_dollars(tmp_path: Path) -> None:
    """A formula's LaTeX text is emitted wrapped in ``$$ ... $$``."""
    formula_item = {
        "self_ref": "#/texts/0",
        "label": "formula",
        "content_layer": "body",
        "text": "C = 2 \\cdot P",
        "orig": "<unreadable>",
        "prov": [],
    }
    payload = _doc(body_children=["#/texts/0"], texts=[formula_item])
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    block = ir.blocks[0]
    assert len(block.equations) == 1
    equation = block.equations[0]
    latex = equation.latex
    assert latex.startswith("$$") and latex.endswith("$$")
    assert "C = 2 \\cdot P" in latex
    assert equation.self_ref == "#/texts/0"
    assert "{{EQ:eq1}}" in block.content_template
- # ---------------------------------------------------------------------------
- # 12. key_value_items / form_items audit
- # ---------------------------------------------------------------------------
def test_docling_adapter_kv_form_items_audit_in_split_option(tmp_path: Path) -> None:
    """Counts of key/value and form items are recorded in ``split_option``."""
    payload = _doc(
        body_children=[],
        key_value_items=[{"id": "kv1"}, {"id": "kv2"}],
        form_items=[{"id": "f1"}],
    )
    raw_dir = _write_doc(tmp_path, payload)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    audit = ir.split_option["docling_extras"]
    assert audit == {"key_value_items": 2, "form_items": 1}
|