# ir_builder.py
"""Docling IR builder: ``DoclingDocument`` JSON → :class:`IRDoc`.

Input contract: a ``*.docling_raw/`` directory containing a ``<stem>.json``
produced by docling-serve with ``to_formats=[json,md]`` +
``image_export_mode=referenced``. Companion ``<stem>.md`` and
``artifacts/`` are not read by the builder (markdown stays for human
inspection; image bytes are referenced by relative URI).

Conversion rules (informed by
``docs/DoclingSidecarRefactorPlan-zh.md`` §5):

- **Faithful** mapping. We do NOT correct heading levels from numbering,
  do NOT bind orphan ``caption`` / ``footnote`` text to neighbouring
  tables/pictures via proximity, do NOT merge continuation tables, do NOT
  invent captions or refer to inline neighbours. If docling didn't make
  the link, the sidecar doesn't make it either.
- ``content_layer != "body"`` is filtered everywhere (top-level traversal,
  group expansion, picture children). Furniture / background never leaks
  into blocks, positions, or consumed_refs.
- ``texts[*].label="title"`` → heading level 1; ``"section_header"`` →
  Docling ``level + 1`` (default 2 when level missing).
- ``texts[*].label="caption"|"footnote"`` are dropped from the reading
  stream **iff** their ref is referenced by a table/picture (via
  ``captions`` / ``footnotes`` refs, or as a direct ``children`` ref
  whose target is itself a caption/footnote). Otherwise they remain as
  regular text in the reading flow.
- ``pictures[*]`` without a usable image reference are skipped instead of
  emitting empty-path drawings. ``pictures[*].children`` references that
  are NOT caption/footnote are treated as inner-OCR text and excluded from
  the reading stream only for pictures that are emitted.
- ``IRPosition`` writes ``origin="LEFTTOP"`` only when the source
  ``prov.bbox.coord_origin == "TOPLEFT"``. ``BOTTOMLEFT`` inherits the
  doc-level meta (``{"origin":"LEFTBOTTOM"}`` by default). Coordinates
  are written verbatim — never flipped.
- ``DOCLING_BBOX_ATTRIBUTES`` env (JSON) can override the doc-level
  ``bbox_attributes``, mirroring MinerU's behaviour.
- Equations: ``texts[k].label == "formula"`` is treated as a structural
  formula signal whenever text/orig/content is non-empty. Top-level formulas
  become block equations; formulas inside inline groups become inline
  equations.
"""
  39. from __future__ import annotations
  40. import base64
  41. import json
  42. import os
  43. import re
  44. from pathlib import Path
  45. from typing import Any
  46. from lightrag.parser.external._common import env_json
  47. from lightrag.parser.external.docling.manifest import select_main_json
  48. from lightrag.sidecar.ir import (
  49. AssetSpec,
  50. IRBlock,
  51. IRDoc,
  52. IRDrawing,
  53. IREquation,
  54. IRPosition,
  55. IRTable,
  56. )
  57. from lightrag.utils import logger
# Heading carried by the block accumulator before any real heading has been
# seen, so leading body content still lands in a named block.
PREFACE_HEADING = "Preface/Uncategorized"

# Docling JSON Pointer ``#/texts/3``, ``#/tables/2``, ``#/pictures/0``,
# ``#/groups/5``, or ``#/body``. The ``kind`` group captures the collection
# name; the optional ``index`` group captures the numeric position.
_REF_PATTERN = re.compile(r"^#/(?P<kind>[a-z_]+)(?:/(?P<index>\d+))?$")
  62. class DoclingIRBuilder:
  63. """Stateless except for env-driven config. Reusable across calls."""
  64. def __init__(self) -> None:
  65. self.engine_version = os.getenv("DOCLING_ENGINE_VERSION", "").strip()
  66. self.bbox_attributes = self._load_bbox_attributes_env()
  67. @staticmethod
  68. def _load_bbox_attributes_env() -> dict[str, Any]:
  69. default = {"origin": "LEFTBOTTOM"}
  70. parsed = env_json("DOCLING_BBOX_ATTRIBUTES", default)
  71. if not isinstance(parsed, dict):
  72. logger.warning(
  73. "[docling_ir_builder] DOCLING_BBOX_ATTRIBUTES must decode to an object; "
  74. "falling back to %s",
  75. default,
  76. )
  77. return dict(default)
  78. return parsed
  79. # ------------------------------------------------------------------
  80. # Entry point
  81. # ------------------------------------------------------------------
  82. def normalize_from_workdir(
  83. self,
  84. raw_dir: Path,
  85. *,
  86. document_name: str,
  87. ) -> IRDoc:
  88. main_json = select_main_json(raw_dir, Path(document_name))
  89. try:
  90. doc = json.loads(main_json.read_text(encoding="utf-8"))
  91. except json.JSONDecodeError as exc:
  92. raise ValueError(
  93. f"Docling raw JSON malformed at {main_json}: {exc}"
  94. ) from exc
  95. if not isinstance(doc, dict):
  96. raise ValueError(f"Docling raw JSON is not an object at {main_json}")
  97. return self._normalize(doc, raw_dir, document_name=document_name)
  98. # ------------------------------------------------------------------
  99. # Core traversal
  100. # ------------------------------------------------------------------
  101. def _normalize(
  102. self,
  103. doc: dict,
  104. raw_dir: Path,
  105. *,
  106. document_name: str,
  107. ) -> IRDoc:
  108. document_format = Path(document_name).suffix.lower().lstrip(".")
  109. ref_index = _build_ref_index(doc)
  110. consumed_refs, picture_inner_refs = _precompute_consumed_refs(doc, raw_dir)
  111. blocks: list[IRBlock] = []
  112. assets: list[AssetSpec] = []
  113. seen_asset_refs: dict[str, str] = {}
  114. doc_title = ""
  115. placeholder_counter = 0
  116. def _next_key(prefix: str) -> str:
  117. nonlocal placeholder_counter
  118. placeholder_counter += 1
  119. return f"{prefix}{placeholder_counter}"
  120. # Heading stack + current block accumulator — identical structure
  121. # to MinerUIRBuilder so downstream P-chunking and provenance behave
  122. # the same way regardless of engine.
  123. heading_stack: list[str] = []
  124. cb_lines: list[str] = []
  125. cb_tables: list[IRTable] = []
  126. cb_drawings: list[IRDrawing] = []
  127. cb_equations: list[IREquation] = []
  128. cb_page_set: set[str] = set()
  129. cb_bbox_positions: list[IRPosition] = []
  130. cb_heading = PREFACE_HEADING
  131. cb_level = 0
  132. cb_parents: list[str] = []
  133. cb_has_body = False
  134. visited: set[str] = set()
  135. kv_count = len(doc.get("key_value_items") or [])
  136. form_count = len(doc.get("form_items") or [])
  137. # --- closures over the accumulator -----------------------------
  138. def _flush_block() -> None:
  139. nonlocal cb_lines, cb_tables, cb_drawings, cb_equations
  140. nonlocal cb_page_set, cb_bbox_positions, cb_has_body
  141. has_payload = bool(cb_lines or cb_tables or cb_drawings or cb_equations)
  142. if not has_payload:
  143. return
  144. content = "\n".join(line for line in cb_lines if line)
  145. if not content.strip() and not (cb_tables or cb_drawings or cb_equations):
  146. cb_lines = []
  147. cb_page_set = set()
  148. cb_bbox_positions = []
  149. cb_has_body = False
  150. return
  151. positions = [
  152. IRPosition(type="bbox", anchor=p)
  153. for p in _sort_page_anchors(cb_page_set)
  154. ] + list(cb_bbox_positions)
  155. blocks.append(
  156. IRBlock(
  157. content_template=content,
  158. heading=cb_heading,
  159. level=cb_level,
  160. parent_headings=list(cb_parents),
  161. positions=positions,
  162. tables=list(cb_tables),
  163. drawings=list(cb_drawings),
  164. equations=list(cb_equations),
  165. )
  166. )
  167. cb_lines = []
  168. cb_tables = []
  169. cb_drawings = []
  170. cb_equations = []
  171. cb_page_set = set()
  172. cb_bbox_positions = []
  173. cb_has_body = False
  174. def _open_block(heading: str, level: int, parents: list[str]) -> None:
  175. nonlocal cb_heading, cb_level, cb_parents
  176. cb_heading = heading
  177. cb_level = level
  178. cb_parents = parents
  179. md_prefix = "#" * max(level, 1)
  180. cb_lines.append(f"{md_prefix} {heading}")
  181. def _merge_heading_as_body(heading: str, level: int) -> None:
  182. md_prefix = "#" * max(level, 1)
  183. cb_lines.append(f"{md_prefix} {heading}")
  184. def _append_text(text: str) -> bool:
  185. nonlocal cb_has_body
  186. if not text:
  187. return False
  188. cb_lines.append(text)
  189. cb_has_body = True
  190. return True
  191. def _record_positions(item: dict) -> None:
  192. for prov in item.get("prov") or []:
  193. if not isinstance(prov, dict):
  194. continue
  195. bbox = prov.get("bbox") or {}
  196. page_raw = prov.get("page_no")
  197. charspan = prov.get("charspan")
  198. if isinstance(bbox, dict) and all(
  199. k in bbox for k in ("l", "t", "r", "b")
  200. ):
  201. coord_origin = str(bbox.get("coord_origin") or "").upper()
  202. origin_override: str | None = None
  203. if coord_origin == "TOPLEFT":
  204. origin_override = "LEFTTOP"
  205. elif coord_origin == "BOTTOMLEFT":
  206. origin_override = None
  207. elif coord_origin:
  208. logger.warning(
  209. "[docling_ir_builder] unknown coord_origin %r; "
  210. "writing through as override",
  211. coord_origin,
  212. )
  213. origin_override = coord_origin
  214. anchor = str(page_raw) if page_raw is not None else None
  215. range_ = [
  216. bbox["l"],
  217. bbox["t"],
  218. bbox["r"],
  219. bbox["b"],
  220. ]
  221. cb_bbox_positions.append(
  222. IRPosition(
  223. type="bbox",
  224. anchor=anchor,
  225. range=range_,
  226. charspan=(
  227. list(charspan) if isinstance(charspan, list) else None
  228. ),
  229. origin=origin_override,
  230. )
  231. )
  232. elif page_raw is not None:
  233. cb_page_set.add(str(page_raw))
  234. # --- main traversal -------------------------------------------
  235. def _visit_ref(ref: str) -> None:
  236. if not ref or ref in consumed_refs or ref in visited:
  237. return
  238. visited.add(ref)
  239. item = ref_index.get(ref)
  240. if item is None:
  241. return
  242. if _content_layer(item) != "body":
  243. return
  244. kind = _ref_kind(ref)
  245. if kind == "groups":
  246. _visit_group(item)
  247. return
  248. if kind == "texts":
  249. _handle_text(item)
  250. return
  251. if kind == "tables":
  252. _handle_table(item)
  253. return
  254. if kind == "pictures":
  255. _handle_picture(item)
  256. return
  257. # Unknown kind — log and ignore; falling through silently would
  258. # hide schema drift in future docling releases.
  259. logger.warning(
  260. "[docling_ir_builder] unknown ref kind %r (ref=%r); skipping", kind, ref
  261. )
  262. def _visit_group(group: dict) -> None:
  263. label = str(group.get("label") or "").lower()
  264. if label not in {
  265. "list",
  266. "inline",
  267. "picture_area",
  268. "section",
  269. "form_area",
  270. "key_value_area",
  271. "ordered_list",
  272. "unordered_list",
  273. "chapter",
  274. }:
  275. logger.warning(
  276. "[docling_ir_builder] unrecognized group label %r; "
  277. "expanding children as default reading order",
  278. label,
  279. )
  280. if label == "inline":
  281. _handle_inline_group(group)
  282. return
  283. _visit_children(group)
  284. def _visit_children(item: dict) -> None:
  285. for child_ref in item.get("children") or []:
  286. ref = _ref_str(child_ref)
  287. _visit_ref(ref)
  288. def _handle_inline_group(group: dict) -> None:
  289. """``inline`` groups concatenate text and inline formulas on one line."""
  290. buf: list[str] = []
  291. pages_recorded = False
  292. for child_ref in group.get("children") or []:
  293. ref = _ref_str(child_ref)
  294. if ref in consumed_refs:
  295. continue
  296. child = ref_index.get(ref)
  297. if not isinstance(child, dict):
  298. continue
  299. if _content_layer(child) != "body":
  300. continue
  301. if _ref_kind(ref) != "texts":
  302. continue
  303. visited.add(ref)
  304. label = str(child.get("label") or "").lower()
  305. piece = (
  306. _make_equation_placeholder(child, is_block=False)
  307. if label == "formula"
  308. else _text_of(child)
  309. )
  310. if piece:
  311. buf.append(piece)
  312. if not pages_recorded:
  313. _record_positions(child)
  314. pages_recorded = True
  315. line = " ".join(buf).strip()
  316. if line:
  317. _append_text(line)
  318. def _handle_text(item: dict) -> None:
  319. nonlocal doc_title, heading_stack, cb_has_body
  320. label = str(item.get("label") or "").lower()
  321. text = _text_of(item).strip()
  322. # Heading?
  323. heading_level = _docling_heading_level(label, item)
  324. if heading_level > 0 and text:
  325. heading_stack = heading_stack[: max(heading_level - 1, 0)]
  326. parents = [h for h in heading_stack if h]
  327. heading_stack.append(text)
  328. # Adjacency merge
  329. if cb_level > 0 and not cb_has_body and heading_level > cb_level:
  330. _merge_heading_as_body(text, heading_level)
  331. _record_positions(item)
  332. if not doc_title and heading_level == 1:
  333. doc_title = text
  334. _visit_children(item)
  335. return
  336. _flush_block()
  337. _open_block(text, heading_level, parents)
  338. _record_positions(item)
  339. if not doc_title and heading_level == 1:
  340. doc_title = text
  341. _visit_children(item)
  342. return
  343. # Formula — Docling's label is the structural signal. For DOCX,
  344. # valid LaTeX may have text == orig, so do not use that equality
  345. # as an enrichment-off heuristic.
  346. if label == "formula":
  347. _handle_formula(item)
  348. _visit_children(item)
  349. return
  350. # list_item: keep the marker if Docling captured one
  351. if label == "list_item":
  352. marker = str(item.get("marker") or "").strip()
  353. line = f"{marker} {text}".strip() if marker else text
  354. if line and _append_text(line):
  355. _record_positions(item)
  356. _visit_children(item)
  357. return
  358. # Caption/footnote not consumed by any table/picture → keep in
  359. # reading flow as ordinary text (preserves original prefixes).
  360. if label in {"caption", "footnote", "text", "code"}:
  361. if _append_text(text):
  362. _record_positions(item)
  363. _visit_children(item)
  364. return
  365. # page_header / page_footer should have been filtered by
  366. # content_layer; reach here only if someone misuses the label.
  367. if label in {"page_header", "page_footer"}:
  368. return
  369. # Unknown label: fall back to writing the text and warn once.
  370. if text:
  371. logger.warning(
  372. "[docling_ir_builder] unknown text label %r; treating as body",
  373. label,
  374. )
  375. if _append_text(text):
  376. _record_positions(item)
  377. _visit_children(item)
  378. def _handle_formula(item: dict) -> None:
  379. placeholder = _make_equation_placeholder(item, is_block=True)
  380. if not placeholder:
  381. return
  382. cb_lines.append(placeholder)
  383. _bump_has_body()
  384. _record_positions(item)
  385. def _make_equation_placeholder(item: dict, *, is_block: bool) -> str:
  386. latex_raw = _text_of(item).strip()
  387. if not latex_raw:
  388. return ""
  389. placeholder = _next_key("eq")
  390. token = "EQ" if is_block else "EQI"
  391. latex = f"$$ {latex_raw} $$" if is_block else latex_raw
  392. cb_equations.append(
  393. IREquation(
  394. placeholder_key=placeholder,
  395. latex=latex,
  396. is_block=is_block,
  397. self_ref=str(item.get("self_ref") or "") if is_block else "",
  398. )
  399. )
  400. return f"{{{{{token}:{placeholder}}}}}"
  401. def _bump_has_body() -> None:
  402. nonlocal cb_has_body
  403. cb_has_body = True
  404. def _handle_table(item: dict) -> None:
  405. table = _build_ir_table(item, ref_index)
  406. if table is None:
  407. # Empty body — _build_ir_table already logged the drop.
  408. # Skip placeholder allocation and position recording so the
  409. # body-less table item leaves no trace in the IR.
  410. return
  411. placeholder = _next_key("tb")
  412. table.placeholder_key = placeholder
  413. cb_tables.append(table)
  414. cb_lines.append(f"{{{{TBL:{placeholder}}}}}")
  415. _bump_has_body()
  416. _record_positions(item)
  417. def _handle_picture(item: dict) -> None:
  418. built = _build_ir_drawing(
  419. item,
  420. ref_index=ref_index,
  421. picture_inner_refs=picture_inner_refs,
  422. raw_dir=raw_dir,
  423. seen_asset_refs=seen_asset_refs,
  424. )
  425. if built is None:
  426. return
  427. drawing, asset = built
  428. placeholder = _next_key("im")
  429. drawing.placeholder_key = placeholder
  430. if asset is not None and asset.ref not in {a.ref for a in assets}:
  431. assets.append(asset)
  432. cb_drawings.append(drawing)
  433. cb_lines.append(f"{{{{IMG:{placeholder}}}}}")
  434. _bump_has_body()
  435. _record_positions(item)
  436. # Kick off traversal from body.children
  437. body = doc.get("body") or {}
  438. for child_ref in body.get("children") or []:
  439. _visit_ref(_ref_str(child_ref))
  440. _flush_block()
  441. if not doc_title:
  442. doc_title = Path(document_name).stem or document_name
  443. split_option: dict[str, Any] = {}
  444. if self.engine_version:
  445. split_option["engine_version"] = self.engine_version
  446. docling_extras: dict[str, Any] = {}
  447. if kv_count:
  448. docling_extras["key_value_items"] = kv_count
  449. if form_count:
  450. docling_extras["form_items"] = form_count
  451. if docling_extras:
  452. split_option["docling_extras"] = docling_extras
  453. return IRDoc(
  454. document_name=document_name,
  455. document_format=document_format,
  456. doc_title=doc_title,
  457. split_option=split_option,
  458. blocks=blocks,
  459. assets=assets,
  460. bbox_attributes=dict(self.bbox_attributes),
  461. )
  462. # ---------------------------------------------------------------------------
  463. # Module-level helpers
  464. # ---------------------------------------------------------------------------
  465. def _ref_str(node: Any) -> str:
  466. """Normalize a Docling reference (``{"$ref": "#/texts/0"}`` or a bare
  467. string) to its string form. Returns ``""`` on garbage input."""
  468. if isinstance(node, str):
  469. return node
  470. if isinstance(node, dict):
  471. v = node.get("$ref") or node.get("ref")
  472. if isinstance(v, str):
  473. return v
  474. return ""
  475. def _ref_kind(ref: str) -> str:
  476. m = _REF_PATTERN.match(ref)
  477. return m.group("kind") if m else ""
  478. def _build_ref_index(doc: dict) -> dict[str, dict]:
  479. """Map every JSON-pointer-style ref to its target object.
  480. Builds entries for ``#/body``, ``#/texts/N``, ``#/tables/N``,
  481. ``#/pictures/N``, ``#/groups/N``. The body object is *not* a
  482. typical content item but we index it so callers don't need a
  483. special case when chasing arbitrary refs.
  484. """
  485. index: dict[str, dict] = {}
  486. body = doc.get("body")
  487. if isinstance(body, dict):
  488. index["#/body"] = body
  489. for key, prefix in (
  490. ("texts", "#/texts/"),
  491. ("tables", "#/tables/"),
  492. ("pictures", "#/pictures/"),
  493. ("groups", "#/groups/"),
  494. ):
  495. items = doc.get(key)
  496. if not isinstance(items, list):
  497. continue
  498. for i, obj in enumerate(items):
  499. if isinstance(obj, dict):
  500. index[f"{prefix}{i}"] = obj
  501. return index
  502. def _precompute_consumed_refs(doc: dict, raw_dir: Path) -> tuple[set[str], set[str]]:
  503. """Return ``(consumed_refs, picture_inner_refs)``.
  504. ``consumed_refs`` enumerates text refs that must NOT enter the reading
  505. stream. The rules below apply only when the owning table/picture is
  506. itself in the body content layer — refs harvested from furniture or
  507. background items are ignored so they do not block legitimate body text
  508. that might be reachable through ``body.children``:
  509. - body ``tables[*].captions`` and ``tables[*].footnotes``
  510. - body ``pictures[*].captions`` and ``pictures[*].footnotes`` only when
  511. the picture has a usable image reference and will be emitted
  512. - body ``tables[*].children`` / ``pictures[*].children`` that resolve
  513. to ``texts[*]`` with ``label="caption"`` or ``"footnote"``
  514. - All body ``pictures[*].children`` that are non-caption/footnote texts
  515. (the picture's inner OCR text). These also land in
  516. ``picture_inner_refs`` so the builder can attribute them to the
  517. drawing's extras.
  518. Sibling text nodes are NOT touched: only refs explicitly linked from a
  519. table/picture object qualify.
  520. """
  521. consumed: set[str] = set()
  522. picture_inner: set[str] = set()
  523. text_label_index: dict[str, str] = {}
  524. for i, obj in enumerate(doc.get("texts") or []):
  525. if isinstance(obj, dict):
  526. text_label_index[f"#/texts/{i}"] = str(obj.get("label") or "").lower()
  527. # Furniture/background tables/pictures must not consume refs that may
  528. # appear under body.children — the builder contract is that non-body
  529. # items are filtered everywhere, including their outgoing refs.
  530. for table in doc.get("tables") or []:
  531. if not isinstance(table, dict):
  532. continue
  533. if _content_layer(table) != "body":
  534. continue
  535. for ref in _iter_refs(table.get("captions")):
  536. consumed.add(ref)
  537. for ref in _iter_refs(table.get("footnotes")):
  538. consumed.add(ref)
  539. for ref in _iter_refs(table.get("children")):
  540. label = text_label_index.get(ref)
  541. if label in {"caption", "footnote"}:
  542. consumed.add(ref)
  543. for pic in doc.get("pictures") or []:
  544. if not isinstance(pic, dict):
  545. continue
  546. if _content_layer(pic) != "body":
  547. continue
  548. if not _has_usable_picture_image(pic, raw_dir):
  549. continue
  550. for ref in _iter_refs(pic.get("captions")):
  551. consumed.add(ref)
  552. for ref in _iter_refs(pic.get("footnotes")):
  553. consumed.add(ref)
  554. for ref in _iter_refs(pic.get("children")):
  555. label = text_label_index.get(ref)
  556. if label in {"caption", "footnote"}:
  557. consumed.add(ref)
  558. elif ref.startswith("#/texts/"):
  559. consumed.add(ref)
  560. picture_inner.add(ref)
  561. return consumed, picture_inner
  562. def _iter_refs(value: Any):
  563. """Yield refs from either a list of ref dicts/strings, or a single one."""
  564. if value is None:
  565. return
  566. if isinstance(value, list):
  567. for item in value:
  568. ref = _ref_str(item)
  569. if ref:
  570. yield ref
  571. else:
  572. ref = _ref_str(value)
  573. if ref:
  574. yield ref
  575. def _content_layer(item: dict) -> str:
  576. return str(item.get("content_layer") or "body").lower()
  577. def _text_of(item: dict) -> str:
  578. for key in ("text", "orig", "content"):
  579. v = item.get(key)
  580. if isinstance(v, str) and v.strip():
  581. return v
  582. return ""
  583. def _docling_heading_level(label: str, item: dict) -> int:
  584. """Map a Docling text item to its IR heading level.
  585. - ``title`` → level 1
  586. - ``section_header`` → ``item.level + 1`` (fallback 2)
  587. Returns 0 when the item is not a heading.
  588. """
  589. if label == "title":
  590. return 1
  591. if label == "section_header":
  592. raw = item.get("level")
  593. try:
  594. level = int(raw)
  595. except (TypeError, ValueError):
  596. level = 0
  597. if level <= 0:
  598. return 2
  599. return level + 1
  600. return 0
  601. def _resolve_text_refs(refs: Any, ref_index: dict[str, dict]) -> list[str]:
  602. """Resolve a list of ``$ref`` entries to their text bodies.
  603. Skips targets whose ``content_layer`` is not ``"body"``. The builder
  604. contract (see module docstring) is that furniture/background items
  605. never leak into sidecar metadata — even when a body table or picture
  606. explicitly references them, because such refs are typically the
  607. consequence of a page-header/footer being mislabeled as a caption.
  608. """
  609. out: list[str] = []
  610. for ref in _iter_refs(refs):
  611. target = ref_index.get(ref)
  612. if not isinstance(target, dict):
  613. continue
  614. if _content_layer(target) != "body":
  615. continue
  616. txt = _text_of(target).strip()
  617. if txt:
  618. out.append(txt)
  619. return out
  620. def _build_ir_table(
  621. item: dict,
  622. ref_index: dict[str, dict],
  623. ) -> IRTable | None:
  624. data = item.get("data") or {}
  625. grid = data.get("grid") if isinstance(data, dict) else None
  626. rows = _rows_from_grid(grid)
  627. if not rows and isinstance(data, dict) and data.get("table_cells"):
  628. rows = _rows_from_table_cells(data)
  629. # Docling never populates IRTable.html, so a table without visible row
  630. # content would land in the sidecar as ``content=""`` and trip the
  631. # analyze worker's "missing table content" path (mirrors the MinerU
  632. # filter in lightrag/parser/external/mineru/ir_builder.py). Drop the
  633. # item up here so the IR stays clean.
  634. if not _table_rows_have_content(rows):
  635. logger.info(
  636. "[docling_ir_builder] dropping empty table item "
  637. "(self_ref=%s, num_rows=%s, num_cols=%s)",
  638. item.get("self_ref"),
  639. data.get("num_rows") if isinstance(data, dict) else None,
  640. data.get("num_cols") if isinstance(data, dict) else None,
  641. )
  642. return None
  643. num_rows = (
  644. int(data.get("num_rows") or len(rows) or 0)
  645. if isinstance(data, dict)
  646. else len(rows)
  647. )
  648. num_cols = int(
  649. (data.get("num_cols") if isinstance(data, dict) else 0)
  650. or (max((len(r) for r in rows), default=0))
  651. )
  652. table_header = _extract_table_header(grid)
  653. captions = _resolve_text_refs(item.get("captions"), ref_index)
  654. if not captions:
  655. # Fallback: direct children with label="caption"
  656. captions = _resolve_children_with_label(
  657. item.get("children"), ref_index, "caption"
  658. )
  659. footnotes = _resolve_text_refs(item.get("footnotes"), ref_index)
  660. if not footnotes:
  661. footnotes = _resolve_children_with_label(
  662. item.get("children"), ref_index, "footnote"
  663. )
  664. return IRTable(
  665. placeholder_key="",
  666. rows=rows or None,
  667. html=None,
  668. num_rows=num_rows,
  669. num_cols=num_cols,
  670. caption=" / ".join(captions),
  671. footnotes=footnotes,
  672. table_header=table_header,
  673. self_ref=str(item.get("self_ref") or ""),
  674. )
  675. def _table_rows_have_content(rows: list[list[str]]) -> bool:
  676. """True iff at least one cell carries visible text."""
  677. for row in rows:
  678. for cell in row:
  679. if isinstance(cell, str) and cell.strip():
  680. return True
  681. return False
  682. def _rows_from_grid(grid: Any) -> list[list[str]]:
  683. out: list[list[str]] = []
  684. if not isinstance(grid, list):
  685. return out
  686. for row in grid:
  687. if not isinstance(row, list):
  688. continue
  689. out.append(
  690. [str((c or {}).get("text", "") if isinstance(c, dict) else c) for c in row]
  691. )
  692. return out
  693. def _rows_from_table_cells(data: dict) -> list[list[str]]:
  694. num_rows = int(data.get("num_rows") or 0)
  695. num_cols = int(data.get("num_cols") or 0)
  696. cells = data.get("table_cells") or []
  697. if num_rows <= 0 or num_cols <= 0 or not isinstance(cells, list):
  698. return []
  699. grid = [[""] * num_cols for _ in range(num_rows)]
  700. for cell in cells:
  701. if not isinstance(cell, dict):
  702. continue
  703. text = str(cell.get("text") or "")
  704. rs = int(cell.get("start_row_offset_idx") or 0)
  705. re_ = int(cell.get("end_row_offset_idx") or rs + 1)
  706. cs = int(cell.get("start_col_offset_idx") or 0)
  707. ce_ = int(cell.get("end_col_offset_idx") or cs + 1)
  708. for r in range(max(rs, 0), min(re_, num_rows)):
  709. for c in range(max(cs, 0), min(ce_, num_cols)):
  710. grid[r][c] = text
  711. return grid
  712. def _extract_table_header(grid: Any) -> list[list[str]] | None:
  713. """Return the contiguous top rows where every cell has
  714. ``column_header=True`` and ``start_row_offset_idx==0`` (the spec calls
  715. out both conditions to defeat false positives from spanning cells).
  716. """
  717. if not isinstance(grid, list):
  718. return None
  719. header_rows: list[list[str]] = []
  720. for row in grid:
  721. if not isinstance(row, list):
  722. break
  723. if (
  724. all(
  725. isinstance(c, dict)
  726. and bool(c.get("column_header"))
  727. and int(c.get("start_row_offset_idx") or 0) == 0
  728. for c in row
  729. )
  730. and row
  731. ):
  732. header_rows.append([str((c or {}).get("text", "")) for c in row])
  733. else:
  734. break
  735. return header_rows or None
  736. def _resolve_children_with_label(
  737. children: Any, ref_index: dict[str, dict], expected_label: str
  738. ) -> list[str]:
  739. out: list[str] = []
  740. for ref in _iter_refs(children):
  741. target = ref_index.get(ref)
  742. if not isinstance(target, dict):
  743. continue
  744. # Same body-only filter as _resolve_text_refs; see its docstring.
  745. if _content_layer(target) != "body":
  746. continue
  747. if str(target.get("label") or "").lower() != expected_label:
  748. continue
  749. txt = _text_of(target).strip()
  750. if txt:
  751. out.append(txt)
  752. return out
  753. def _resolve_picture_ocr_paragraphs(
  754. children: Any, ref_index: dict[str, dict], picture_inner_refs: set[str]
  755. ) -> list[str]:
  756. """Resolve picture OCR child refs into non-empty body-layer paragraphs."""
  757. paragraphs: list[str] = []
  758. for ref in _iter_refs(children):
  759. if ref not in picture_inner_refs:
  760. continue
  761. target = ref_index.get(ref)
  762. if not isinstance(target, dict):
  763. continue
  764. if _content_layer(target) != "body":
  765. continue
  766. txt = _text_of(target).strip()
  767. if txt:
  768. paragraphs.append(txt)
  769. return paragraphs
  770. def _build_ir_drawing(
  771. item: dict,
  772. *,
  773. ref_index: dict[str, dict],
  774. picture_inner_refs: set[str],
  775. raw_dir: Path,
  776. seen_asset_refs: dict[str, str],
  777. ) -> tuple[IRDrawing, AssetSpec | None] | None:
  778. image = item.get("image") or {}
  779. uri = ""
  780. mimetype = ""
  781. image_size: tuple[float, float] | None = None
  782. dpi: Any = None
  783. if isinstance(image, dict):
  784. uri = str(image.get("uri") or "")
  785. mimetype = str(image.get("mimetype") or "")
  786. size = image.get("size") or {}
  787. if isinstance(size, dict) and "width" in size and "height" in size:
  788. image_size = (float(size["width"]), float(size["height"]))
  789. dpi = image.get("dpi")
  790. fmt = _image_fmt_from_mimetype(mimetype) or (
  791. Path(uri).suffix.lstrip(".").lower() if uri else ""
  792. )
  793. captions = _resolve_text_refs(item.get("captions"), ref_index)
  794. if not captions:
  795. captions = _resolve_children_with_label(
  796. item.get("children"), ref_index, "caption"
  797. )
  798. footnotes = _resolve_text_refs(item.get("footnotes"), ref_index)
  799. if not footnotes:
  800. footnotes = _resolve_children_with_label(
  801. item.get("children"), ref_index, "footnote"
  802. )
  803. extras: dict[str, Any] = {}
  804. if image_size is not None:
  805. extras["intrinsic_size"] = list(image_size)
  806. if dpi is not None:
  807. extras["dpi"] = dpi
  808. if mimetype:
  809. extras["mimetype"] = mimetype
  810. if "parent" in item:
  811. extras["parent"] = item.get("parent")
  812. ocr_paragraphs = _resolve_picture_ocr_paragraphs(
  813. item.get("children"), ref_index, picture_inner_refs
  814. )
  815. if ocr_paragraphs:
  816. extras["ocr_texts"] = "\n\n".join(ocr_paragraphs)
  817. extras["ocr_texts_count"] = len(ocr_paragraphs)
  818. if item.get("annotations"):
  819. extras["annotations"] = item.get("annotations")
  820. if item.get("references"):
  821. extras["references"] = item.get("references")
  822. asset_ref = ""
  823. asset: AssetSpec | None = None
  824. path_override: str | None = None
  825. drawing_kwargs: dict[str, Any] = {}
  826. if not uri:
  827. return None
  828. if uri.startswith("data:"):
  829. decoded = _decode_data_uri(uri)
  830. if decoded is not None:
  831. payload, ext = decoded
  832. stem = (
  833. (item.get("self_ref") or "picture").replace("#/", "").replace("/", "_")
  834. )
  835. suggested = f"{stem}.{ext or fmt or 'bin'}"
  836. asset_ref = uri # use the data URI as a stable ref
  837. if asset_ref not in seen_asset_refs:
  838. asset = AssetSpec(
  839. ref=asset_ref,
  840. suggested_name=suggested,
  841. source=payload,
  842. )
  843. seen_asset_refs[asset_ref] = suggested
  844. else:
  845. logger.warning(
  846. "[docling_ir_builder] skipping picture %s because data URI could "
  847. "not be decoded",
  848. item.get("self_ref") or "<unknown>",
  849. )
  850. return None
  851. elif uri.startswith(("http://", "https://")):
  852. path_override = uri
  853. asset_ref = uri
  854. else:
  855. asset_ref = uri
  856. if asset_ref not in seen_asset_refs:
  857. # A malicious/corrupted bundle JSON could point at "../../etc/..."
  858. # or an absolute path; the zip extractor's traversal guard only
  859. # covers member names, not refs embedded in JSON metadata. Resolve
  860. # against raw_dir and require the result to stay inside.
  861. source_path = _resolve_local_image_path(raw_dir, uri)
  862. suggested = Path(uri).name or f"image_{len(seen_asset_refs):06d}"
  863. asset = AssetSpec(
  864. ref=asset_ref,
  865. suggested_name=suggested,
  866. source=source_path if source_path is not None else None,
  867. )
  868. if source_path is None:
  869. logger.warning(
  870. "[docling_ir_builder] skipping picture %s because image URI "
  871. "%r could not be resolved inside %s",
  872. item.get("self_ref") or "<unknown>",
  873. uri,
  874. raw_dir,
  875. )
  876. return None
  877. seen_asset_refs[asset_ref] = suggested
  878. if path_override is not None:
  879. drawing_kwargs["path_override"] = path_override
  880. drawing = IRDrawing(
  881. placeholder_key="",
  882. asset_ref=asset_ref,
  883. fmt=fmt,
  884. caption=" / ".join(captions),
  885. footnotes=footnotes,
  886. src=str(item.get("src") or ""),
  887. self_ref=str(item.get("self_ref") or ""),
  888. extras=extras,
  889. **drawing_kwargs,
  890. )
  891. return drawing, asset
  892. def _image_uri_of(item: dict) -> str:
  893. image = item.get("image")
  894. if not isinstance(image, dict):
  895. return ""
  896. return str(image.get("uri") or "")
  897. def _has_usable_picture_image(item: dict, raw_dir: Path) -> bool:
  898. uri = _image_uri_of(item)
  899. if not uri:
  900. return False
  901. if uri.startswith("data:"):
  902. return _decode_data_uri(uri) is not None
  903. if uri.startswith(("http://", "https://")):
  904. return True
  905. return _resolve_local_image_path(raw_dir, uri) is not None
  906. def _image_fmt_from_mimetype(mimetype: str) -> str:
  907. if not mimetype:
  908. return ""
  909. if mimetype == "image/jpeg":
  910. return "jpg"
  911. if mimetype.startswith("image/"):
  912. return mimetype[len("image/") :].lower()
  913. return ""
  914. def _decode_data_uri(uri: str) -> tuple[bytes, str] | None:
  915. """Decode ``data:image/png;base64,...`` style URIs.
  916. Returns ``(bytes, extension)`` or ``None`` if the payload could not be
  917. decoded. Non-base64 payloads (extremely rare for images) are not
  918. supported and yield ``None``.
  919. """
  920. try:
  921. head, payload = uri.split(",", 1)
  922. except ValueError:
  923. return None
  924. if ";base64" not in head:
  925. return None
  926. try:
  927. data = base64.b64decode(payload, validate=False)
  928. except (ValueError, TypeError):
  929. return None
  930. ext = ""
  931. if head.startswith("data:image/"):
  932. ext = head[len("data:image/") :].split(";", 1)[0].lower()
  933. if ext == "jpeg":
  934. ext = "jpg"
  935. return data, ext
  936. def _resolve_local_image_path(raw_dir: Path, uri: str) -> Path | None:
  937. """Resolve a relative image URI against the bundle root and return it
  938. only if the result is a file *inside* ``raw_dir``.
  939. Returns ``None`` for: absolute URIs (``Path("foo") / "/etc/x"`` discards
  940. the left side and would escape), refs that resolve outside the bundle
  941. (``..``-traversal), and refs whose target does not exist. Symlinks are
  942. followed by ``resolve()`` and the post-resolution path is what's checked,
  943. so a symlink inside the bundle pointing outward is also refused.
  944. """
  945. if not uri or os.path.isabs(uri):
  946. return None
  947. try:
  948. base = raw_dir.resolve(strict=False)
  949. candidate = (raw_dir / uri).resolve(strict=False)
  950. except (OSError, RuntimeError):
  951. return None
  952. try:
  953. candidate.relative_to(base)
  954. except ValueError:
  955. return None
  956. return candidate if candidate.is_file() else None
  957. def _sort_page_anchors(pages: set[str]) -> list[str]:
  958. non_numeric = sorted(p for p in pages if not p.isdigit())
  959. numeric = sorted((p for p in pages if p.isdigit()), key=int)
  960. return non_numeric + numeric
# Public API of this module: only the builder class is exported by
# ``from <module> import *``; the underscore-prefixed helpers are internal.
__all__ = ["DoclingIRBuilder"]