test_ir_builder.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103
  1. """Tests for :class:`DoclingIRBuilder`.
  2. Each test constructs a minimal inline DoclingDocument dict — the smallest
  3. JSON that exercises one mapping rule from
  4. ``docs/DoclingSidecarRefactorPlan-zh.md`` §5. The point is to lock down
  5. contracts that the integration test (running against the live fixture)
  6. cannot inspect cleanly, not to faithfully replicate the docling-serve
  7. output schema.
  8. """
  9. from __future__ import annotations
  10. import json
  11. from pathlib import Path
  12. from typing import Any
  13. import pytest
  14. from lightrag.parser.external.docling.ir_builder import DoclingIRBuilder
  15. # ---------------------------------------------------------------------------
  16. # Helpers to build inline fixtures
  17. # ---------------------------------------------------------------------------
  18. def _write_doc(tmp_path: Path, payload: dict, *, stem: str = "demo") -> Path:
  19. raw_dir = tmp_path / f"{stem}.docling_raw"
  20. raw_dir.mkdir()
  21. (raw_dir / f"{stem}.json").write_text(json.dumps(payload), encoding="utf-8")
  22. return raw_dir
  23. def _doc(
  24. *,
  25. body_children: list[str],
  26. texts: list[dict] | None = None,
  27. tables: list[dict] | None = None,
  28. pictures: list[dict] | None = None,
  29. groups: list[dict] | None = None,
  30. key_value_items: list[dict] | None = None,
  31. form_items: list[dict] | None = None,
  32. ) -> dict:
  33. return {
  34. "schema_name": "DoclingDocument",
  35. "version": "1.10.0",
  36. "origin": {"filename": "demo.pdf", "mimetype": "application/pdf"},
  37. "body": {
  38. "self_ref": "#/body",
  39. "children": [{"$ref": r} for r in body_children],
  40. "content_layer": "body",
  41. "label": "unspecified",
  42. },
  43. "groups": groups or [],
  44. "texts": texts or [],
  45. "pictures": pictures or [],
  46. "tables": tables or [],
  47. "key_value_items": key_value_items or [],
  48. "form_items": form_items or [],
  49. }
  50. def _text_item(
  51. *,
  52. label: str,
  53. text: str,
  54. self_ref: str,
  55. level: int | None = None,
  56. orig: str | None = None,
  57. page_no: int = 1,
  58. bbox: tuple[float, float, float, float] = (10.0, 100.0, 200.0, 80.0),
  59. coord_origin: str = "BOTTOMLEFT",
  60. content_layer: str = "body",
  61. marker: str | None = None,
  62. ) -> dict:
  63. item: dict[str, Any] = {
  64. "self_ref": self_ref,
  65. "label": label,
  66. "text": text,
  67. "orig": orig if orig is not None else text,
  68. "content_layer": content_layer,
  69. "prov": [
  70. {
  71. "page_no": page_no,
  72. "bbox": {
  73. "l": bbox[0],
  74. "t": bbox[1],
  75. "r": bbox[2],
  76. "b": bbox[3],
  77. "coord_origin": coord_origin,
  78. },
  79. "charspan": [0, len(text)],
  80. }
  81. ],
  82. }
  83. if level is not None:
  84. item["level"] = level
  85. if marker is not None:
  86. item["marker"] = marker
  87. return item
  88. @pytest.fixture(autouse=True)
  89. def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None:
  90. for name in ("DOCLING_BBOX_ATTRIBUTES", "DOCLING_ENGINE_VERSION"):
  91. monkeypatch.delenv(name, raising=False)
  92. # ---------------------------------------------------------------------------
  93. # 1. Heading hierarchy
  94. # ---------------------------------------------------------------------------
  95. def test_docling_adapter_simple_heading_hierarchy(tmp_path: Path) -> None:
  96. """Three distinct sections without adjacency-merge folding.
  97. Background and Details each carry their own body, so we end up with one
  98. block per heading and a clean parent-heading chain.
  99. """
  100. texts = [
  101. _text_item(label="title", text="Whole Doc Title", self_ref="#/texts/0"),
  102. _text_item(label="text", text="Title-level body.", self_ref="#/texts/1"),
  103. _text_item(
  104. label="section_header", text="Background", level=1, self_ref="#/texts/2"
  105. ),
  106. _text_item(label="text", text="Some intro body.", self_ref="#/texts/3"),
  107. _text_item(
  108. label="section_header", text="Details", level=2, self_ref="#/texts/4"
  109. ),
  110. _text_item(label="text", text="Detail content.", self_ref="#/texts/5"),
  111. ]
  112. raw_dir = _write_doc(
  113. tmp_path,
  114. _doc(
  115. body_children=[
  116. "#/texts/0",
  117. "#/texts/1",
  118. "#/texts/2",
  119. "#/texts/3",
  120. "#/texts/4",
  121. "#/texts/5",
  122. ],
  123. texts=texts,
  124. ),
  125. )
  126. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  127. assert ir.doc_title == "Whole Doc Title"
  128. headings = [(b.heading, b.level, b.parent_headings) for b in ir.blocks]
  129. # title (level=1), section_header level=1 → IR level 2, section_header level=2 → IR level 3
  130. assert headings == [
  131. ("Whole Doc Title", 1, []),
  132. ("Background", 2, ["Whole Doc Title"]),
  133. ("Details", 3, ["Whole Doc Title", "Background"]),
  134. ]
  135. # heading line is rendered with markdown prefix as the FIRST line
  136. assert ir.blocks[0].content_template.splitlines()[0] == "# Whole Doc Title"
  137. assert ir.blocks[1].content_template.splitlines()[0] == "## Background"
  138. assert ir.blocks[2].content_template.splitlines()[0] == "### Details"
  139. def test_docling_adapter_adjacency_merge_folds_empty_heading(tmp_path: Path) -> None:
  140. """When a heading block has no body and the next heading is deeper,
  141. the deeper heading folds in as a body line (matches MinerU §5.1.4)."""
  142. texts = [
  143. _text_item(label="title", text="Whole Doc Title", self_ref="#/texts/0"),
  144. _text_item(
  145. label="section_header", text="Background", level=1, self_ref="#/texts/1"
  146. ),
  147. _text_item(label="text", text="Body for Background.", self_ref="#/texts/2"),
  148. ]
  149. raw_dir = _write_doc(
  150. tmp_path,
  151. _doc(
  152. body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
  153. texts=texts,
  154. ),
  155. )
  156. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  157. # Title had no body → Background folded into it as a `## ` line
  158. assert len(ir.blocks) == 1
  159. block = ir.blocks[0]
  160. assert block.heading == "Whole Doc Title"
  161. assert block.level == 1
  162. lines = block.content_template.splitlines()
  163. assert lines[0] == "# Whole Doc Title"
  164. assert "## Background" in lines
  165. assert "Body for Background." in lines
  166. def test_docling_adapter_preserves_docling_heading_level(tmp_path: Path) -> None:
  167. """When Docling reports all section_headers at level=1, the adapter
  168. preserves that (no numbering-based level inference)."""
  169. texts = [
  170. _text_item(
  171. label="section_header", text="1 Purpose", level=1, self_ref="#/texts/0"
  172. ),
  173. _text_item(
  174. label="section_header", text="2.1 Electrical", level=1, self_ref="#/texts/1"
  175. ),
  176. _text_item(
  177. label="section_header",
  178. text="2.4.5 Temperature",
  179. level=1,
  180. self_ref="#/texts/2",
  181. ),
  182. ]
  183. raw_dir = _write_doc(
  184. tmp_path,
  185. _doc(
  186. body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
  187. texts=texts,
  188. ),
  189. )
  190. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  191. levels = [b.level for b in ir.blocks]
  192. assert levels == [2, 2, 2] # all bumped by +1, no normalization
  193. # ---------------------------------------------------------------------------
  194. # 2. Multimodal payloads under one heading
  195. # ---------------------------------------------------------------------------
  196. def test_docling_adapter_merges_payloads_under_heading(tmp_path: Path) -> None:
  197. texts = [
  198. _text_item(
  199. label="section_header", text="Section", level=1, self_ref="#/texts/0"
  200. ),
  201. _text_item(label="text", text="Inline body line.", self_ref="#/texts/1"),
  202. ]
  203. tables = [
  204. {
  205. "self_ref": "#/tables/0",
  206. "label": "table",
  207. "content_layer": "body",
  208. "data": {
  209. "num_rows": 1,
  210. "num_cols": 2,
  211. "grid": [[{"text": "A"}, {"text": "B"}]],
  212. },
  213. "prov": [],
  214. }
  215. ]
  216. pictures = [
  217. {
  218. "self_ref": "#/pictures/0",
  219. "label": "picture",
  220. "content_layer": "body",
  221. "image": {"uri": "artifacts/foo.png", "mimetype": "image/png"},
  222. "prov": [],
  223. }
  224. ]
  225. raw_dir = _write_doc(
  226. tmp_path,
  227. _doc(
  228. body_children=[
  229. "#/texts/0",
  230. "#/texts/1",
  231. "#/tables/0",
  232. "#/pictures/0",
  233. ],
  234. texts=texts,
  235. tables=tables,
  236. pictures=pictures,
  237. ),
  238. )
  239. (raw_dir / "artifacts").mkdir()
  240. (raw_dir / "artifacts" / "foo.png").write_bytes(b"\x89PNG fake")
  241. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  242. assert len(ir.blocks) == 1
  243. block = ir.blocks[0]
  244. template = block.content_template
  245. # one of each placeholder appears in source order
  246. assert "{{TBL:tb1}}" in template
  247. assert "{{IMG:im2}}" in template
  248. assert template.index("{{TBL:tb1}}") < template.index("{{IMG:im2}}")
  249. assert len(block.tables) == 1
  250. assert block.tables[0].rows == [["A", "B"]]
  251. assert len(block.drawings) == 1
  252. assert block.drawings[0].asset_ref == "artifacts/foo.png"
  253. assert block.drawings[0].fmt == "png"
  254. assert any(a.ref == "artifacts/foo.png" for a in ir.assets)
  255. def test_docling_adapter_visits_text_children_for_modalities(
  256. tmp_path: Path,
  257. ) -> None:
  258. texts = [
  259. _text_item(
  260. label="section_header",
  261. text="Section",
  262. level=1,
  263. self_ref="#/texts/0",
  264. ),
  265. _text_item(label="text", text="Child paragraph.", self_ref="#/texts/1"),
  266. _text_item(
  267. label="formula",
  268. text="E = mc^2",
  269. orig="E = mc^2",
  270. self_ref="#/texts/2",
  271. ),
  272. ]
  273. texts[0]["children"] = [
  274. {"$ref": "#/texts/1"},
  275. {"$ref": "#/tables/0"},
  276. {"$ref": "#/pictures/0"},
  277. {"$ref": "#/texts/2"},
  278. ]
  279. tables = [
  280. {
  281. "self_ref": "#/tables/0",
  282. "label": "table",
  283. "content_layer": "body",
  284. "data": {
  285. "num_rows": 1,
  286. "num_cols": 2,
  287. "grid": [[{"text": "A"}, {"text": "B"}]],
  288. },
  289. "prov": [],
  290. }
  291. ]
  292. pictures = [
  293. {
  294. "self_ref": "#/pictures/0",
  295. "label": "picture",
  296. "content_layer": "body",
  297. "image": {"uri": "artifacts/foo.png", "mimetype": "image/png"},
  298. "prov": [],
  299. }
  300. ]
  301. raw_dir = _write_doc(
  302. tmp_path,
  303. _doc(
  304. body_children=["#/texts/0"],
  305. texts=texts,
  306. tables=tables,
  307. pictures=pictures,
  308. ),
  309. )
  310. (raw_dir / "artifacts").mkdir()
  311. (raw_dir / "artifacts" / "foo.png").write_bytes(b"\x89PNG fake")
  312. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  313. assert len(ir.blocks) == 1
  314. block = ir.blocks[0]
  315. assert "Child paragraph." in block.content_template
  316. assert "{{TBL:tb1}}" in block.content_template
  317. assert "{{IMG:im2}}" in block.content_template
  318. assert "{{EQ:eq3}}" in block.content_template
  319. assert len(block.tables) == 1
  320. assert len(block.drawings) == 1
  321. assert len(block.equations) == 1
  322. assert block.equations[0].is_block is True
  323. # ---------------------------------------------------------------------------
  324. # 3. Inline groups
  325. # ---------------------------------------------------------------------------
  326. def test_docling_adapter_inline_group_joins_children(tmp_path: Path) -> None:
  327. texts = [
  328. _text_item(label="section_header", text="S", level=1, self_ref="#/texts/0"),
  329. _text_item(label="text", text="hello", self_ref="#/texts/1"),
  330. _text_item(label="text", text="world", self_ref="#/texts/2"),
  331. ]
  332. groups = [
  333. {
  334. "self_ref": "#/groups/0",
  335. "label": "inline",
  336. "content_layer": "body",
  337. "children": [{"$ref": "#/texts/1"}, {"$ref": "#/texts/2"}],
  338. }
  339. ]
  340. raw_dir = _write_doc(
  341. tmp_path,
  342. _doc(
  343. body_children=["#/texts/0", "#/groups/0"],
  344. texts=texts,
  345. groups=groups,
  346. ),
  347. )
  348. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  349. assert "hello world" in ir.blocks[0].content_template
  350. def test_docling_adapter_inline_group_emits_inline_formula(
  351. tmp_path: Path,
  352. ) -> None:
  353. texts = [
  354. _text_item(label="section_header", text="S", level=1, self_ref="#/texts/0"),
  355. _text_item(label="text", text="alpha", self_ref="#/texts/1"),
  356. _text_item(
  357. label="formula",
  358. text="x_i",
  359. orig="x_i",
  360. self_ref="#/texts/2",
  361. ),
  362. _text_item(label="text", text="omega", self_ref="#/texts/3"),
  363. ]
  364. groups = [
  365. {
  366. "self_ref": "#/groups/0",
  367. "label": "inline",
  368. "content_layer": "body",
  369. "children": [
  370. {"$ref": "#/texts/1"},
  371. {"$ref": "#/texts/2"},
  372. {"$ref": "#/texts/3"},
  373. ],
  374. }
  375. ]
  376. raw_dir = _write_doc(
  377. tmp_path,
  378. _doc(
  379. body_children=["#/texts/0", "#/groups/0"],
  380. texts=texts,
  381. groups=groups,
  382. ),
  383. )
  384. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  385. block = ir.blocks[0]
  386. assert "alpha {{EQI:eq1}} omega" in block.content_template
  387. assert [eq.is_block for eq in block.equations] == [False]
  388. assert block.equations[0].latex == "x_i"
  389. # ---------------------------------------------------------------------------
  390. # 4. Tables — grid & header
  391. # ---------------------------------------------------------------------------
  392. def test_docling_adapter_table_grid_and_header(tmp_path: Path) -> None:
  393. tables = [
  394. {
  395. "self_ref": "#/tables/0",
  396. "label": "table",
  397. "content_layer": "body",
  398. "captions": [{"$ref": "#/texts/0"}],
  399. "footnotes": [{"$ref": "#/texts/1"}],
  400. "data": {
  401. "num_rows": 2,
  402. "num_cols": 2,
  403. "grid": [
  404. [
  405. {
  406. "text": "h1",
  407. "column_header": True,
  408. "start_row_offset_idx": 0,
  409. },
  410. {
  411. "text": "h2",
  412. "column_header": True,
  413. "start_row_offset_idx": 0,
  414. },
  415. ],
  416. [{"text": "a"}, {"text": "b"}],
  417. ],
  418. },
  419. "prov": [],
  420. }
  421. ]
  422. texts = [
  423. _text_item(label="caption", text="Table caption", self_ref="#/texts/0"),
  424. _text_item(label="footnote", text="Note: x", self_ref="#/texts/1"),
  425. ]
  426. raw_dir = _write_doc(
  427. tmp_path,
  428. _doc(
  429. body_children=["#/tables/0"],
  430. texts=texts,
  431. tables=tables,
  432. ),
  433. )
  434. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  435. assert len(ir.blocks) == 1
  436. table = ir.blocks[0].tables[0]
  437. assert table.rows == [["h1", "h2"], ["a", "b"]]
  438. assert table.num_rows == 2
  439. assert table.num_cols == 2
  440. assert table.caption == "Table caption"
  441. assert table.footnotes == ["Note: x"]
  442. assert table.table_header == [["h1", "h2"]]
  443. assert table.self_ref == "#/tables/0"
  444. def test_docling_adapter_empty_table_dropped(tmp_path: Path) -> None:
  445. """Table items with no usable body MUST NOT enter the IR.
  446. Docling never populates ``IRTable.html``, so a body-less table would
  447. land in the sidecar as ``content=""`` and trip the analyze worker's
  448. "missing table content" path. Mirrors the MinerU-side filter in
  449. lightrag/parser/external/mineru/ir_builder.py.
  450. """
  451. # Four shapes of "no visible content" — all must be dropped.
  452. tables = [
  453. # 1) ``data`` missing entirely.
  454. {"self_ref": "#/tables/0", "label": "table", "content_layer": "body"},
  455. # 2) Empty grid.
  456. {
  457. "self_ref": "#/tables/1",
  458. "label": "table",
  459. "content_layer": "body",
  460. "data": {"num_rows": 0, "num_cols": 0, "grid": []},
  461. },
  462. # 3) Grid with only blank cell text.
  463. {
  464. "self_ref": "#/tables/2",
  465. "label": "table",
  466. "content_layer": "body",
  467. "data": {
  468. "num_rows": 1,
  469. "num_cols": 2,
  470. "grid": [[{"text": ""}, {"text": " "}]],
  471. },
  472. },
  473. # 4) table_cells fallback yields a blank grid.
  474. {
  475. "self_ref": "#/tables/3",
  476. "label": "table",
  477. "content_layer": "body",
  478. "data": {
  479. "num_rows": 1,
  480. "num_cols": 1,
  481. "table_cells": [
  482. {
  483. "text": "",
  484. "start_row_offset_idx": 0,
  485. "end_row_offset_idx": 1,
  486. "start_col_offset_idx": 0,
  487. "end_col_offset_idx": 1,
  488. }
  489. ],
  490. },
  491. },
  492. ]
  493. texts = [_text_item(label="text", text="kept", self_ref="#/texts/0")]
  494. raw_dir = _write_doc(
  495. tmp_path,
  496. _doc(
  497. body_children=[
  498. "#/tables/0",
  499. "#/tables/1",
  500. "#/tables/2",
  501. "#/tables/3",
  502. "#/texts/0",
  503. ],
  504. texts=texts,
  505. tables=tables,
  506. ),
  507. )
  508. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  509. table_count = sum(len(b.tables) for b in ir.blocks)
  510. assert table_count == 0
  511. joined = "\n".join(b.content_template for b in ir.blocks)
  512. assert "TBL:" not in joined
  513. assert "kept" in joined
  514. def test_docling_adapter_table_extras_is_empty(tmp_path: Path) -> None:
  515. """`IRTable.extras` is intentionally left blank by the docling adapter:
  516. the historical ``parent`` / ``children_refs`` / ``references`` /
  517. ``annotations`` / ``cells`` fields were never consumed downstream and
  518. bloated ``tables.json`` by ~50%. Construct a table that *would* have
  519. populated all five legacy fields and assert ``extras == {}``."""
  520. tables = [
  521. {
  522. "self_ref": "#/tables/0",
  523. "label": "table",
  524. "content_layer": "body",
  525. "parent": {"$ref": "#/body"},
  526. "children": [{"$ref": "#/texts/0"}],
  527. "references": [{"foo": "bar"}],
  528. "annotations": [{"note": "x"}],
  529. "data": {
  530. "num_rows": 1,
  531. "num_cols": 1,
  532. "grid": [[{"text": "x"}]],
  533. "table_cells": [
  534. {
  535. "text": "x",
  536. "row_span": 1,
  537. "col_span": 1,
  538. "start_row_offset_idx": 0,
  539. "end_row_offset_idx": 1,
  540. "start_col_offset_idx": 0,
  541. "end_col_offset_idx": 1,
  542. "bbox": {"l": 1, "t": 2, "r": 3, "b": 4},
  543. }
  544. ],
  545. },
  546. "prov": [],
  547. }
  548. ]
  549. texts = [_text_item(label="caption", text="c", self_ref="#/texts/0")]
  550. raw_dir = _write_doc(
  551. tmp_path,
  552. _doc(body_children=["#/tables/0"], texts=texts, tables=tables),
  553. )
  554. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  555. assert ir.blocks[0].tables[0].extras == {}
  556. # ---------------------------------------------------------------------------
  557. # 5. Picture — referenced asset
  558. # ---------------------------------------------------------------------------
  559. def test_docling_adapter_picture_referenced_asset(tmp_path: Path) -> None:
  560. pictures = [
  561. {
  562. "self_ref": "#/pictures/0",
  563. "label": "picture",
  564. "content_layer": "body",
  565. "image": {
  566. "uri": "artifacts/image_000000_abc.png",
  567. "mimetype": "image/png",
  568. "size": {"width": 100.0, "height": 200.0},
  569. },
  570. "prov": [],
  571. }
  572. ]
  573. raw_dir = _write_doc(
  574. tmp_path,
  575. _doc(body_children=["#/pictures/0"], pictures=pictures),
  576. )
  577. art = raw_dir / "artifacts"
  578. art.mkdir()
  579. asset = art / "image_000000_abc.png"
  580. asset.write_bytes(b"\x89PNG fake")
  581. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  582. drawing = ir.blocks[0].drawings[0]
  583. assert drawing.asset_ref == "artifacts/image_000000_abc.png"
  584. assert drawing.fmt == "png"
  585. assert drawing.self_ref == "#/pictures/0"
  586. [a] = [a for a in ir.assets if a.ref == drawing.asset_ref]
  587. assert a.source == asset
  588. assert a.suggested_name == "image_000000_abc.png"
  589. # intrinsic_size lands in extras for downstream VLM filtering
  590. assert drawing.extras["intrinsic_size"] == [100.0, 200.0]
  591. # ---------------------------------------------------------------------------
  592. # 6. Positions & bbox_attributes
  593. # ---------------------------------------------------------------------------
  594. def test_docling_adapter_positions_and_bbox_attributes(tmp_path: Path) -> None:
  595. texts = [
  596. _text_item(
  597. label="text",
  598. text="A",
  599. self_ref="#/texts/0",
  600. page_no=1,
  601. bbox=(10.0, 100.0, 200.0, 80.0),
  602. coord_origin="BOTTOMLEFT",
  603. ),
  604. _text_item(
  605. label="text",
  606. text="B",
  607. self_ref="#/texts/1",
  608. page_no=2,
  609. bbox=(20.0, 50.0, 220.0, 30.0),
  610. coord_origin="TOPLEFT",
  611. ),
  612. ]
  613. raw_dir = _write_doc(
  614. tmp_path,
  615. _doc(body_children=["#/texts/0", "#/texts/1"], texts=texts),
  616. )
  617. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  618. assert ir.bbox_attributes == {"origin": "LEFTBOTTOM"}
  619. # no max / page_sizes leaks
  620. assert set(ir.bbox_attributes.keys()) == {"origin"}
  621. positions = ir.blocks[0].positions
  622. bbox_positions = [p for p in positions if p.range]
  623. assert len(bbox_positions) == 2
  624. bl = next(p for p in bbox_positions if p.anchor == "1")
  625. tl = next(p for p in bbox_positions if p.anchor == "2")
  626. assert bl.range == [10.0, 100.0, 200.0, 80.0]
  627. assert bl.origin is None # inherits doc-level LEFTBOTTOM
  628. assert tl.origin == "LEFTTOP" # per-position override
  629. assert tl.range == [20.0, 50.0, 220.0, 30.0]
  630. def test_docling_adapter_bbox_attributes_env_override(
  631. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  632. ) -> None:
  633. monkeypatch.setenv("DOCLING_BBOX_ATTRIBUTES", '{"origin":"LEFTTOP"}')
  634. texts = [
  635. _text_item(
  636. label="text",
  637. text="A",
  638. self_ref="#/texts/0",
  639. coord_origin="BOTTOMLEFT",
  640. )
  641. ]
  642. raw_dir = _write_doc(
  643. tmp_path,
  644. _doc(body_children=["#/texts/0"], texts=texts),
  645. )
  646. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  647. assert ir.bbox_attributes == {"origin": "LEFTTOP"}
  648. # ---------------------------------------------------------------------------
  649. # 7. caption / footnote refs (positive + sibling-not-consumed)
  650. # ---------------------------------------------------------------------------
  651. def test_docling_adapter_caption_refs_only(tmp_path: Path) -> None:
  652. """The caption referenced by tables[0].captions is consumed (kept in
  653. IRTable.caption, dropped from reading flow). Sibling text NOT
  654. referenced — even when it looks like a caption — stays in the reading
  655. flow."""
  656. texts = [
  657. _text_item(label="caption", text="Tab1 caption", self_ref="#/texts/0"),
  658. _text_item(label="text", text="Tab1 sibling", self_ref="#/texts/1"),
  659. _text_item(label="caption", text="Orphan caption", self_ref="#/texts/2"),
  660. ]
  661. tables = [
  662. {
  663. "self_ref": "#/tables/0",
  664. "label": "table",
  665. "content_layer": "body",
  666. "captions": [{"$ref": "#/texts/0"}],
  667. "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
  668. "prov": [],
  669. }
  670. ]
  671. raw_dir = _write_doc(
  672. tmp_path,
  673. _doc(
  674. body_children=["#/tables/0", "#/texts/1", "#/texts/2"],
  675. texts=texts,
  676. tables=tables,
  677. ),
  678. )
  679. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  680. block = ir.blocks[0]
  681. assert block.tables[0].caption == "Tab1 caption"
  682. # consumed caption ref does not leak into body text
  683. assert "Tab1 caption" not in block.content_template
  684. # orphan caption and sibling text DO appear in body
  685. assert "Tab1 sibling" in block.content_template
  686. assert "Orphan caption" in block.content_template
  687. def test_docling_adapter_footnotes_refs_only(tmp_path: Path) -> None:
  688. texts = [
  689. _text_item(label="footnote", text="Linked footnote", self_ref="#/texts/0"),
  690. _text_item(label="text", text="注: this is sibling note", self_ref="#/texts/1"),
  691. ]
  692. tables = [
  693. {
  694. "self_ref": "#/tables/0",
  695. "label": "table",
  696. "content_layer": "body",
  697. "footnotes": [{"$ref": "#/texts/0"}],
  698. "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
  699. "prov": [],
  700. }
  701. ]
  702. raw_dir = _write_doc(
  703. tmp_path,
  704. _doc(
  705. body_children=["#/tables/0", "#/texts/1"],
  706. texts=texts,
  707. tables=tables,
  708. ),
  709. )
  710. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  711. block = ir.blocks[0]
  712. assert block.tables[0].footnotes == ["Linked footnote"]
  713. assert "Linked footnote" not in block.content_template
  714. assert "注: this is sibling note" in block.content_template
  715. def test_docling_adapter_table_refs_skip_non_body_caption_footnote(
  716. tmp_path: Path,
  717. ) -> None:
  718. # A body table references a caption/footnote whose targets sit in
  719. # content_layer="furniture" — typically a page header/footer that
  720. # docling mislabeled and linked to the table. The adapter contract is
  721. # that furniture text must never leak into sidecar metadata, so the
  722. # IRTable's caption/footnotes lists must come back empty (and the body
  723. # reading flow must not pick up the furniture text either).
  724. texts = [
  725. _text_item(
  726. label="caption",
  727. text="Page header masquerading as caption",
  728. self_ref="#/texts/0",
  729. content_layer="furniture",
  730. ),
  731. _text_item(
  732. label="footnote",
  733. text="Page footer masquerading as footnote",
  734. self_ref="#/texts/1",
  735. content_layer="furniture",
  736. ),
  737. ]
  738. tables = [
  739. {
  740. "self_ref": "#/tables/0",
  741. "label": "table",
  742. "content_layer": "body",
  743. "captions": [{"$ref": "#/texts/0"}],
  744. "footnotes": [{"$ref": "#/texts/1"}],
  745. "data": {"num_rows": 1, "num_cols": 1, "grid": [[{"text": "x"}]]},
  746. "prov": [],
  747. }
  748. ]
  749. raw_dir = _write_doc(
  750. tmp_path,
  751. _doc(body_children=["#/tables/0"], texts=texts, tables=tables),
  752. )
  753. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  754. block = ir.blocks[0]
  755. assert block.tables[0].caption == ""
  756. assert block.tables[0].footnotes == []
  757. assert "Page header masquerading" not in block.content_template
  758. assert "Page footer masquerading" not in block.content_template
  759. def test_docling_adapter_picture_children_fallback_skips_non_body(
  760. tmp_path: Path,
  761. ) -> None:
  762. # Same invariant for the children fallback path: a body picture has no
  763. # explicit captions/footnotes, but its ``children`` list refs a caption
  764. # whose target is furniture. ``_resolve_children_with_label`` must
  765. # skip it rather than silently surfacing furniture text as the
  766. # picture's caption.
  767. texts = [
  768. _text_item(
  769. label="caption",
  770. text="Furniture caption via children",
  771. self_ref="#/texts/0",
  772. content_layer="furniture",
  773. ),
  774. ]
  775. pictures = [
  776. {
  777. "self_ref": "#/pictures/0",
  778. "label": "picture",
  779. "content_layer": "body",
  780. "image": {
  781. "uri": "artifacts/p0.png",
  782. "mimetype": "image/png",
  783. },
  784. "children": [{"$ref": "#/texts/0"}],
  785. "prov": [],
  786. }
  787. ]
  788. raw_dir = _write_doc(
  789. tmp_path,
  790. _doc(body_children=["#/pictures/0"], texts=texts, pictures=pictures),
  791. )
  792. (raw_dir / "artifacts").mkdir()
  793. (raw_dir / "artifacts" / "p0.png").write_bytes(b"\x89PNG fake")
  794. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  795. block = ir.blocks[0]
  796. assert block.drawings[0].caption == ""
  797. assert "Furniture caption via children" not in block.content_template
  798. # ---------------------------------------------------------------------------
  799. # 8. furniture skipped
  800. # ---------------------------------------------------------------------------
  801. def test_docling_adapter_furniture_skipped_by_content_layer(tmp_path: Path) -> None:
  802. texts = [
  803. _text_item(label="section_header", text="H", level=1, self_ref="#/texts/0"),
  804. _text_item(label="text", text="Body sentence.", self_ref="#/texts/1"),
  805. _text_item(
  806. label="page_footer",
  807. text="footer 1/5",
  808. self_ref="#/texts/2",
  809. content_layer="furniture",
  810. ),
  811. ]
  812. raw_dir = _write_doc(
  813. tmp_path,
  814. _doc(
  815. body_children=["#/texts/0", "#/texts/1", "#/texts/2"],
  816. texts=texts,
  817. ),
  818. )
  819. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  820. full = "\n".join(b.content_template for b in ir.blocks)
  821. assert "footer 1/5" not in full
  822. # the furniture's prov page_no=1 must not leak into any block position
  823. for block in ir.blocks:
  824. for pos in block.positions:
  825. assert (
  826. pos.anchor != "1"
  827. or pos.range is not None
  828. or any(p.range is not None for p in block.positions)
  829. )
  830. # ---------------------------------------------------------------------------
  831. # 9. Picture inner children dropped from reading flow
  832. # ---------------------------------------------------------------------------
  833. def test_docling_adapter_picture_children_dropped(tmp_path: Path) -> None:
  834. texts = [
  835. _text_item(label="caption", text="Picture caption", self_ref="#/texts/0"),
  836. _text_item(label="text", text="Inner OCR text 1", self_ref="#/texts/1"),
  837. _text_item(label="text", text="Inner OCR text 2", self_ref="#/texts/2"),
  838. _text_item(label="text", text="", self_ref="#/texts/3"),
  839. ]
  840. pictures = [
  841. {
  842. "self_ref": "#/pictures/0",
  843. "label": "picture",
  844. "content_layer": "body",
  845. "image": {"uri": "artifacts/img.png", "mimetype": "image/png"},
  846. "children": [
  847. {"$ref": "#/texts/0"},
  848. {"$ref": "#/texts/1"},
  849. {"$ref": "#/texts/2"},
  850. {"$ref": "#/texts/3"},
  851. ],
  852. "prov": [],
  853. }
  854. ]
  855. raw_dir = _write_doc(
  856. tmp_path,
  857. _doc(body_children=["#/pictures/0"], texts=texts, pictures=pictures),
  858. )
  859. art = raw_dir / "artifacts"
  860. art.mkdir()
  861. (art / "img.png").write_bytes(b"png")
  862. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  863. block = ir.blocks[0]
  864. drawing = block.drawings[0]
  865. # caption (label=caption) is taken via children fallback
  866. assert drawing.caption == "Picture caption"
  867. assert "Picture caption" not in drawing.extras.get("ocr_texts", "")
  868. # OCR-only children do NOT appear in body content
  869. assert "Inner OCR text 1" not in block.content_template
  870. assert "Inner OCR text 2" not in block.content_template
  871. # extras records non-empty OCR paragraphs, not raw child refs.
  872. assert drawing.extras["ocr_texts"] == "Inner OCR text 1\n\nInner OCR text 2"
  873. assert drawing.extras["ocr_texts_count"] == 2
  874. # ---------------------------------------------------------------------------
  875. # 10. Picture with missing image is skipped
  876. # ---------------------------------------------------------------------------
  877. def test_docling_adapter_picture_missing_image_skipped(tmp_path: Path) -> None:
  878. pictures = [
  879. {
  880. "self_ref": "#/pictures/0",
  881. "label": "picture",
  882. "content_layer": "body",
  883. "image": None,
  884. "prov": [],
  885. }
  886. ]
  887. raw_dir = _write_doc(
  888. tmp_path,
  889. _doc(body_children=["#/pictures/0"], pictures=pictures),
  890. )
  891. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  892. assert ir.blocks == []
  893. assert ir.assets == []
  894. def test_docling_adapter_picture_rejects_traversal_uri(tmp_path: Path) -> None:
  895. # A poisoned bundle JSON points the image URI outside raw_dir via "..".
  896. # The asset must NOT pick up the outside file — otherwise write_sidecar
  897. # would copy it into parsed assets, turning a parser-side compromise
  898. # into arbitrary local-file exfiltration.
  899. outside = tmp_path / "secret.png"
  900. outside.write_bytes(b"\x89PNG outside")
  901. pictures = [
  902. {
  903. "self_ref": "#/pictures/0",
  904. "label": "picture",
  905. "content_layer": "body",
  906. "image": {
  907. "uri": "../secret.png",
  908. "mimetype": "image/png",
  909. },
  910. "prov": [],
  911. }
  912. ]
  913. raw_dir = _write_doc(
  914. tmp_path,
  915. _doc(body_children=["#/pictures/0"], pictures=pictures),
  916. )
  917. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  918. assert ir.blocks == []
  919. assert ir.assets == []
  920. def test_docling_adapter_picture_rejects_absolute_uri(tmp_path: Path) -> None:
  921. # ``Path("raw_dir") / "/etc/passwd"`` discards raw_dir on POSIX, so an
  922. # absolute URI would escape even without a "..". Reject these too.
  923. outside = tmp_path / "leak.png"
  924. outside.write_bytes(b"\x89PNG outside")
  925. pictures = [
  926. {
  927. "self_ref": "#/pictures/0",
  928. "label": "picture",
  929. "content_layer": "body",
  930. "image": {
  931. "uri": str(outside),
  932. "mimetype": "image/png",
  933. },
  934. "prov": [],
  935. }
  936. ]
  937. raw_dir = _write_doc(
  938. tmp_path,
  939. _doc(body_children=["#/pictures/0"], pictures=pictures),
  940. )
  941. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  942. assert ir.blocks == []
  943. assert ir.assets == []
  944. # ---------------------------------------------------------------------------
  945. # 11. Formula
  946. # ---------------------------------------------------------------------------
  947. def test_docling_adapter_formula_text_equals_orig_still_emits_equation(
  948. tmp_path: Path,
  949. ) -> None:
  950. texts = [
  951. {
  952. "self_ref": "#/texts/0",
  953. "label": "formula",
  954. "content_layer": "body",
  955. "text": "C = 2 * P / X",
  956. "orig": "C = 2 * P / X",
  957. "prov": [],
  958. }
  959. ]
  960. raw_dir = _write_doc(
  961. tmp_path,
  962. _doc(body_children=["#/texts/0"], texts=texts),
  963. )
  964. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  965. block = ir.blocks[0]
  966. assert len(block.equations) == 1
  967. assert block.equations[0].is_block is True
  968. assert "C = 2 * P / X" in block.equations[0].latex
  969. assert "{{EQ:eq1}}" in block.content_template
  970. def test_docling_adapter_formula_with_latex_wraps_dollars(tmp_path: Path) -> None:
  971. texts = [
  972. {
  973. "self_ref": "#/texts/0",
  974. "label": "formula",
  975. "content_layer": "body",
  976. "text": "C = 2 \\cdot P",
  977. "orig": "<unreadable>",
  978. "prov": [],
  979. }
  980. ]
  981. raw_dir = _write_doc(
  982. tmp_path,
  983. _doc(body_children=["#/texts/0"], texts=texts),
  984. )
  985. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  986. block = ir.blocks[0]
  987. assert len(block.equations) == 1
  988. eq = block.equations[0]
  989. assert eq.latex.startswith("$$") and eq.latex.endswith("$$")
  990. assert "C = 2 \\cdot P" in eq.latex
  991. assert eq.self_ref == "#/texts/0"
  992. assert "{{EQ:eq1}}" in block.content_template
  993. # ---------------------------------------------------------------------------
  994. # 12. key_value_items / form_items audit
  995. # ---------------------------------------------------------------------------
  996. def test_docling_adapter_kv_form_items_audit_in_split_option(tmp_path: Path) -> None:
  997. raw_dir = _write_doc(
  998. tmp_path,
  999. _doc(
  1000. body_children=[],
  1001. key_value_items=[{"id": "kv1"}, {"id": "kv2"}],
  1002. form_items=[{"id": "f1"}],
  1003. ),
  1004. )
  1005. ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")
  1006. extras = ir.split_option["docling_extras"]
  1007. assert extras == {"key_value_items": 2, "form_items": 1}