ir_builder.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749
  1. """MinerU IR builder: ``content_list.json`` (+ images/) → :class:`IRDoc`.
  2. Input contract: a ``*.mineru_raw/`` directory containing at least
  3. ``content_list.json``. Optional sibling resources (``images/``,
  4. ``middle.json``, ``full.md``, ``layout.pdf``) are kept as-is; this builder
  5. only reads the content list and image asset bytes.
  6. Conversion rules (informed by spec §3-§六):
  7. - ``text`` items with ``text_level>0`` and ``title`` / ``section_header``
  8. start a NEW block. The heading text is rendered with a markdown ``#``
  9. prefix matching the level (``# foo``, ``## bar`` …) as the first line of
  10. the new block's content.
  11. - All other items (``text``, ``list``, ``code``, ``table``, ``image``,
  12. ``equation``) are MERGED into the current block — their text / placeholder
  13. is appended (newline-separated) to the heading's block. This mirrors the
  14. native docx parser's "split-by-heading, merge-everything-under-heading"
  15. behavior (see ``parser/docx/parse_document.py``).
  16. - Content emitted before the first heading lands in a synthetic
  17. ``Preface/Uncategorized`` block at level 0.
  18. - ``list`` items joined with ``\n``; ``code`` body taken from ``code_body``
  19. if present.
  20. - ``table`` → IRTable + ``{{TBL:k}}`` placeholder. ``table_body`` (HTML) or
  21. the ``rows`` field (2D array) become ``html`` / ``rows`` on IRTable.
  22. ``num_rows`` / ``num_cols`` are taken from MinerU if present, otherwise
  23. inferred. ``header`` populates ``table_header`` (per spec §5).
  24. - ``image`` / ``picture`` / ``drawing`` → IRDrawing + ``{{IMG:k}}`` placeholder.
  25. Asset bytes are referenced via ``img_path`` relative to the raw dir.
  26. - ``equation`` → IREquation. ``is_block`` is False only when MinerU's
  27. ``text_format`` is ``inline`` / ``inline_equation``; any other value, or a
  28. missing flag, is treated as a block equation. The latex string is preserved
  29. verbatim (including any ``$$``/``$`` wrappers) so ``blocks.jsonl``'s
  30. ``<equation>`` body matches MinerU's raw output; the writer strips the
  31. wrappers when persisting ``equations.json`` content.
  32. - ``page_idx`` + ``bbox`` → ``IRPosition(type="bbox", anchor=page, range=[x0,y0,x1,y1])``.
  33. Empty/missing bbox is acceptable; positions accumulate on the merged block.
  34. - ``IRDoc.split_option`` records the MinerU engine version when available.
  35. - ``IRDoc.bbox_attributes`` defaults to ``{"origin":"LEFTTOP","max":1000}``
  36. reflecting MinerU's PDF coordinate convention. Operators may override
  37. via ``MINERU_BBOX_ATTRIBUTES`` (JSON string).
  38. """
  39. from __future__ import annotations
  40. import json
  41. import os
  42. from pathlib import Path
  43. from typing import Any
  44. from urllib.parse import urlparse
  45. from lightrag.sidecar.ir import (
  46. AssetSpec,
  47. IRBlock,
  48. IRDoc,
  49. IRDrawing,
  50. IREquation,
  51. IRPosition,
  52. IRTable,
  53. )
  54. from lightrag.utils import logger
  55. PREFACE_HEADING = "Preface/Uncategorized"
  56. CONTENT_LIST_FILENAME = "content_list.json"
  57. class MinerUIRBuilder:
  58. """Stateless except for env-driven config. Reusable across calls."""
  59. def __init__(self) -> None:
  60. self.engine_version = os.getenv("MINERU_ENGINE_VERSION", "").strip()
  61. # Mirror MinerURawClient.__init__: when this is set, the downloader
  62. # stores ALL referenced images (including relative ones) under
  63. # ``images/<basename>``. The builder has to look in the same place.
  64. self.image_url_template = os.getenv("MINERU_IMAGE_URL_TEMPLATE", "").strip()
  65. self.bbox_attributes = self._load_bbox_attributes_env()
  66. def _load_bbox_attributes_env(self) -> dict[str, Any]:
  67. default = {"origin": "LEFTTOP", "max": 1000}
  68. raw = os.getenv("MINERU_BBOX_ATTRIBUTES", "").strip()
  69. if not raw:
  70. return default
  71. try:
  72. parsed = json.loads(raw)
  73. except json.JSONDecodeError as exc:
  74. logger.warning(
  75. "[mineru_ir_builder] MINERU_BBOX_ATTRIBUTES is not valid JSON "
  76. "(%s); falling back to default %s",
  77. exc,
  78. default,
  79. )
  80. return default
  81. if not isinstance(parsed, dict):
  82. logger.warning(
  83. "[mineru_ir_builder] MINERU_BBOX_ATTRIBUTES must decode to a JSON "
  84. "object, got %s; falling back to default %s",
  85. type(parsed).__name__,
  86. default,
  87. )
  88. return default
  89. return parsed
  90. # ------------------------------------------------------------------
  91. # Entry point
  92. # ------------------------------------------------------------------
  93. def normalize_from_workdir(
  94. self,
  95. raw_dir: Path,
  96. *,
  97. document_name: str,
  98. ) -> IRDoc:
  99. """Read ``raw_dir/content_list.json`` and emit an IRDoc.
  100. ``document_name`` is the canonical filename (e.g. ``foo.pdf``) used
  101. for ``meta.document_name``; resolved by the caller from the parser
  102. hint chain.
  103. """
  104. content_list_path = raw_dir / "content_list.json"
  105. if not content_list_path.is_file():
  106. raise FileNotFoundError(
  107. f"MinerU raw bundle missing content_list.json at {raw_dir}"
  108. )
  109. content_list = json.loads(content_list_path.read_text(encoding="utf-8"))
  110. if not isinstance(content_list, list):
  111. raise ValueError(
  112. f"MinerU content_list.json malformed (not a JSON array) at {raw_dir}"
  113. )
  114. return self._normalize_content_list(
  115. content_list, raw_dir, document_name=document_name
  116. )
  117. # ------------------------------------------------------------------
  118. # Core
  119. # ------------------------------------------------------------------
  120. def _normalize_content_list(
  121. self,
  122. content_list: list[Any],
  123. raw_dir: Path,
  124. *,
  125. document_name: str,
  126. ) -> IRDoc:
  127. document_format = Path(document_name).suffix.lower().lstrip(".")
  128. blocks: list[IRBlock] = []
  129. assets: list[AssetSpec] = []
  130. seen_assets: dict[str, str] = {} # ref → suggested_name
  131. doc_title = ""
  132. placeholder_counter = 0
  133. def _next_key(prefix: str) -> str:
  134. nonlocal placeholder_counter
  135. placeholder_counter += 1
  136. return f"{prefix}{placeholder_counter}"
  137. # Heading hierarchy stack — index = level-1 (level 1 lives at [0]).
  138. heading_stack: list[str] = []
  139. # Current-block accumulator. The block is materialized when the next
  140. # heading arrives (or at end-of-document). The initial block is the
  141. # synthetic "Preface/Uncategorized" container at level 0.
  142. cb_lines: list[str] = []
  143. cb_tables: list[IRTable] = []
  144. cb_drawings: list[IRDrawing] = []
  145. cb_equations: list[IREquation] = []
  146. # Positions are split into two channels:
  147. # - ``cb_page_set`` collects ``page_idx`` of bbox-less items; at flush
  148. # each unique page becomes one anchor-only summary ``IRPosition``.
  149. # - ``cb_bbox_positions`` keeps one fine-grained position per item that
  150. # carried a parseable bbox (anchor + range), in source order, with
  151. # no deduplication.
  152. cb_page_set: set[str] = set()
  153. cb_bbox_positions: list[IRPosition] = []
  154. cb_heading = PREFACE_HEADING
  155. cb_level = 0
  156. cb_parents: list[str] = []
  157. # ``cb_has_body`` flips True the moment we accumulate any non-heading
  158. # payload into the current block. While it stays False, an adjacent
  159. # deeper heading is folded into this block as a body line (aligning
  160. # with the native docx parser's behaviour for back-to-back headings).
  161. cb_has_body = False
  162. def _record_position(item: dict) -> None:
  163. """Route an item's positional info into the right channel.
  164. Items with a parseable ``bbox`` produce one fine-grained
  165. IRPosition appended to ``cb_bbox_positions`` (no dedupe).
  166. Otherwise, ``page_idx`` (if any) is added to ``cb_page_set``
  167. and emitted as a single anchor-only summary entry at flush.
  168. """
  169. bbox_pos = _extract_bbox_position(item)
  170. if bbox_pos is not None:
  171. cb_bbox_positions.append(bbox_pos)
  172. return
  173. page = _extract_page_anchor(item)
  174. if page is not None:
  175. cb_page_set.add(page)
  176. def _flush_block() -> None:
  177. """Emit the in-flight block if it carries any content."""
  178. nonlocal cb_lines, cb_tables, cb_drawings, cb_equations
  179. nonlocal cb_page_set, cb_bbox_positions, cb_has_body
  180. has_payload = bool(cb_lines or cb_tables or cb_drawings or cb_equations)
  181. if not has_payload:
  182. return
  183. content = "\n".join(line for line in cb_lines if line)
  184. if not content.strip() and not (cb_tables or cb_drawings or cb_equations):
  185. # Reset and skip — nothing meaningful to emit.
  186. cb_lines = []
  187. cb_page_set = set()
  188. cb_bbox_positions = []
  189. cb_has_body = False
  190. return
  191. positions = [
  192. IRPosition(type="bbox", anchor=p)
  193. for p in _sort_page_anchors(cb_page_set)
  194. ] + list(cb_bbox_positions)
  195. blocks.append(
  196. IRBlock(
  197. content_template=content,
  198. heading=cb_heading,
  199. level=cb_level,
  200. parent_headings=list(cb_parents),
  201. positions=positions,
  202. tables=list(cb_tables),
  203. drawings=list(cb_drawings),
  204. equations=list(cb_equations),
  205. )
  206. )
  207. cb_lines = []
  208. cb_tables = []
  209. cb_drawings = []
  210. cb_equations = []
  211. cb_page_set = set()
  212. cb_bbox_positions = []
  213. cb_has_body = False
  214. def _open_block(heading: str, level: int, parents: list[str]) -> None:
  215. nonlocal cb_heading, cb_level, cb_parents
  216. cb_heading = heading
  217. cb_level = level
  218. cb_parents = parents
  219. # Render the heading line into the block body so the merged
  220. # text reads like markdown (``# Foo`` / ``## Bar`` / …).
  221. md_prefix = "#" * max(level, 1)
  222. cb_lines.append(f"{md_prefix} {heading}")
  223. def _append_text(text: str) -> bool:
  224. """Append ``text`` to the current block body and return whether
  225. anything was actually written. Callers use the return value to
  226. decide whether to also record the item's source position — an
  227. empty text item must NOT leak its ``page_idx`` to the block.
  228. """
  229. nonlocal cb_has_body
  230. if not text:
  231. return False
  232. cb_lines.append(text)
  233. cb_has_body = True
  234. return True
  235. def _merge_heading_as_body(heading: str, level: int) -> None:
  236. """Fold an adjacent deeper heading into the current block.
  237. The line keeps its markdown ``#`` prefix so the rendered block
  238. still reads as ``# Section\n## Subsection``. Does NOT flip
  239. ``cb_has_body`` — successive headings can keep folding until a
  240. real body item lands.
  241. """
  242. md_prefix = "#" * max(level, 1)
  243. cb_lines.append(f"{md_prefix} {heading}")
  244. for item_index, item in enumerate(content_list):
  245. if not isinstance(item, dict):
  246. continue
  247. item_type = str(item.get("type") or item.get("label") or "").lower()
  248. heading_text, heading_level = _detect_heading(item, item_type)
  249. if heading_text:
  250. # Heading hierarchy is updated unconditionally so deeper
  251. # parents resolve correctly once the next real body item
  252. # opens a fresh block.
  253. heading_stack = heading_stack[: max(heading_level - 1, 0)]
  254. parents = [h for h in heading_stack if h]
  255. heading_stack.append(heading_text)
  256. # Adjacency merge: previous block is a real heading with no
  257. # body yet AND the new heading is strictly deeper — append
  258. # this heading as body to the existing block instead of
  259. # flushing. (Preface, level=0, is never merged into.)
  260. if cb_level > 0 and not cb_has_body and heading_level > cb_level:
  261. _merge_heading_as_body(heading_text, heading_level)
  262. _record_position(item)
  263. if not doc_title and heading_level == 1:
  264. doc_title = heading_text
  265. continue
  266. _flush_block()
  267. _open_block(heading_text, heading_level, parents)
  268. _record_position(item)
  269. if not doc_title and heading_level == 1:
  270. doc_title = heading_text
  271. continue
  272. if item_type == "text":
  273. if _append_text(_coerce_text(item)):
  274. _record_position(item)
  275. continue
  276. if item_type == "list":
  277. items = item.get("list_items")
  278. if isinstance(items, list):
  279. text = "\n".join(str(x) for x in items if str(x).strip())
  280. else:
  281. text = _coerce_text(item)
  282. if _append_text(text):
  283. _record_position(item)
  284. continue
  285. if item_type == "code":
  286. if _append_text(item.get("code_body") or _coerce_text(item)):
  287. _record_position(item)
  288. continue
  289. if item_type == "equation":
  290. latex_raw = _coerce_text(item)
  291. if not latex_raw:
  292. # Spec compliance fix: empty equation must not enter sidecar.
  293. continue
  294. # Preserve MinerU's raw latex (including any ``$$``/``$``
  295. # wrappers); the writer strips them when emitting
  296. # equations.json so blocks.jsonl shows the raw form while
  297. # the per-equation sidecar holds clean latex.
  298. latex = latex_raw.strip()
  299. is_block = _is_block_equation(item)
  300. caption = str(item.get("caption") or "")
  301. placeholder = _next_key("eq")
  302. token = "EQ" if is_block else "EQI"
  303. cb_equations.append(
  304. IREquation(
  305. placeholder_key=placeholder,
  306. latex=latex,
  307. is_block=is_block,
  308. caption=caption,
  309. footnotes=_as_str_list(item.get("footnotes")),
  310. self_ref=_content_list_self_ref(item_index) if is_block else "",
  311. )
  312. )
  313. cb_lines.append(f"{{{{{token}:{placeholder}}}}}")
  314. cb_has_body = True
  315. _record_position(item)
  316. continue
  317. if item_type == "table":
  318. table = self._build_ir_table(item)
  319. if table is None:
  320. # Empty body — _build_ir_table already logged the drop.
  321. # Skip placeholder allocation and position recording so
  322. # the misidentified item leaves no trace in the IR.
  323. continue
  324. placeholder = _next_key("tb")
  325. table.placeholder_key = placeholder
  326. table.self_ref = _content_list_self_ref(item_index)
  327. cb_tables.append(table)
  328. cb_lines.append(f"{{{{TBL:{placeholder}}}}}")
  329. cb_has_body = True
  330. _record_position(item)
  331. continue
  332. if item_type in {"image", "picture", "drawing"}:
  333. drawing, asset = self._build_ir_drawing(item, raw_dir, seen_assets)
  334. placeholder = _next_key("im")
  335. drawing.placeholder_key = placeholder
  336. drawing.self_ref = _content_list_self_ref(item_index)
  337. if asset is not None and asset.ref not in {a.ref for a in assets}:
  338. assets.append(asset)
  339. cb_drawings.append(drawing)
  340. cb_lines.append(f"{{{{IMG:{placeholder}}}}}")
  341. cb_has_body = True
  342. _record_position(item)
  343. continue
  344. # Fallback: serialize unknown items as plain text so we don't
  345. # silently drop information. Position only recorded when the
  346. # fallback actually contributed text — empty unknown items must
  347. # not leak their page_idx into the current block.
  348. if _append_text(_coerce_text(item)):
  349. _record_position(item)
  350. _flush_block()
  351. if not doc_title:
  352. doc_title = Path(document_name).stem or document_name
  353. split_option: dict[str, Any] = {}
  354. if self.engine_version:
  355. split_option["engine_version"] = self.engine_version
  356. # Reserved hook for later: detect OCR flag from middle.json / config.
  357. return IRDoc(
  358. document_name=document_name,
  359. document_format=document_format,
  360. doc_title=doc_title,
  361. split_option=split_option,
  362. blocks=blocks,
  363. assets=assets,
  364. bbox_attributes=dict(self.bbox_attributes),
  365. )
  366. # ------------------------------------------------------------------
  367. # Tables / drawings
  368. # ------------------------------------------------------------------
  369. def _build_ir_table(self, item: dict) -> IRTable | None:
  370. rows: list[list[str]] | None = None
  371. html: str | None = None
  372. body_field = item.get("rows")
  373. body = body_field if body_field is not None else item.get("table_body")
  374. if isinstance(body, list):
  375. rows = _normalize_grid(body)
  376. elif isinstance(body, str):
  377. stripped = body.strip()
  378. if stripped.startswith("[") and stripped.endswith("]"):
  379. try:
  380. decoded = json.loads(stripped)
  381. if isinstance(decoded, list):
  382. rows = _normalize_grid(decoded)
  383. except json.JSONDecodeError:
  384. pass
  385. if rows is None:
  386. html = stripped or None
  387. elif isinstance(body, dict):
  388. grid = body.get("grid") or body.get("rows")
  389. if isinstance(grid, list):
  390. rows = _normalize_grid(grid)
  391. else:
  392. html = json.dumps(body, ensure_ascii=False)
  393. # MinerU occasionally emits table items with no usable body (e.g. when
  394. # a page number or blank region is misidentified as a table). Dropping
  395. # them here keeps the sidecar free of items that would later trip the
  396. # analyze worker's "missing table content" hard-failure path.
  397. if not _ir_table_body_has_content(rows, html):
  398. logger.debug(
  399. "[mineru_ir_builder] dropping empty table item "
  400. "(body type=%s, num_rows=%s, num_cols=%s)",
  401. type(body).__name__,
  402. item.get("num_rows"),
  403. item.get("num_cols"),
  404. )
  405. return None
  406. num_rows = int(item.get("num_rows") or (len(rows) if rows else 0) or 0)
  407. num_cols_default = max((len(r) for r in rows), default=0) if rows else 0
  408. num_cols = int(item.get("num_cols") or num_cols_default or 0)
  409. captions = item.get("table_caption")
  410. caption = str(item.get("caption") or "")
  411. if not caption and isinstance(captions, list) and captions:
  412. caption = str(captions[0])
  413. table_header_raw = item.get("header")
  414. table_header: list[list[str]] | None = None
  415. if isinstance(table_header_raw, list) and table_header_raw:
  416. table_header = _normalize_grid(table_header_raw)
  417. return IRTable(
  418. placeholder_key="", # filled by caller
  419. rows=rows,
  420. html=html,
  421. num_rows=num_rows,
  422. num_cols=num_cols,
  423. caption=caption,
  424. footnotes=_as_str_list(item.get("table_footnote") or item.get("footnotes")),
  425. table_header=table_header,
  426. )
  427. def _build_ir_drawing(
  428. self,
  429. item: dict,
  430. raw_dir: Path,
  431. seen: dict[str, str],
  432. ) -> tuple[IRDrawing, AssetSpec | None]:
  433. img_path = str(item.get("img_path") or item.get("path") or "")
  434. src_val = str(item.get("src") or "")
  435. captions = item.get("image_caption") or item.get("captions")
  436. caption = str(item.get("caption") or "")
  437. if not caption and isinstance(captions, list) and captions:
  438. caption = str(captions[0])
  439. fmt = Path(img_path).suffix.lower().lstrip(".") if img_path else ""
  440. if not fmt:
  441. fmt = str(item.get("format") or "")
  442. asset: AssetSpec | None = None
  443. ref = ""
  444. if img_path:
  445. ref = img_path
  446. if ref in seen:
  447. # Already declared by a previous block; reuse name.
  448. pass
  449. else:
  450. # Asset source: file on disk inside raw_dir. ``img_path`` is
  451. # untrusted (it comes from MinerU's content_list.json or a
  452. # downloaded zip), so we go through a safe resolver that
  453. # refuses to escape ``raw_dir`` and mirrors the downloader's
  454. # storage layout for absolute-URL / templated references.
  455. local_path = _safe_local_asset_path(
  456. raw_dir,
  457. img_path,
  458. image_url_template=self.image_url_template,
  459. )
  460. suggested_name = _suggested_asset_name(img_path, fmt, len(seen))
  461. asset = AssetSpec(
  462. ref=ref,
  463. suggested_name=suggested_name,
  464. source=local_path
  465. if local_path is not None and local_path.is_file()
  466. else None,
  467. )
  468. seen[ref] = suggested_name
  469. drawing = IRDrawing(
  470. placeholder_key="", # filled by caller
  471. asset_ref=ref,
  472. fmt=fmt,
  473. caption=caption,
  474. footnotes=_as_str_list(item.get("image_footnote") or item.get("footnotes")),
  475. src=src_val,
  476. )
  477. return drawing, asset
  478. # ----------------------------------------------------------------------
  479. # helpers
  480. # ----------------------------------------------------------------------
  481. def _detect_heading(item: dict, item_type: str) -> tuple[str, int]:
  482. """Return ``(heading_text, level)`` if ``item`` is a heading, else ``("", 0)``.
  483. A heading is either an explicit ``title``/``section_header`` block, or a
  484. ``text`` block whose ``text_level`` is positive (MinerU's convention).
  485. """
  486. if item_type in {"title", "section_header"}:
  487. text = _coerce_text(item).strip()
  488. level = max(int(item.get("text_level") or item.get("level") or 1), 1)
  489. return text, level
  490. if item_type == "text":
  491. try:
  492. tl = int(item.get("text_level") or 0)
  493. except (TypeError, ValueError):
  494. tl = 0
  495. if tl > 0:
  496. return _coerce_text(item).strip(), tl
  497. return "", 0
  498. def _coerce_text(item: dict) -> str:
  499. for key in ("text", "content", "body", "code_body"):
  500. val = item.get(key)
  501. if isinstance(val, str) and val.strip():
  502. return val
  503. return ""
  504. def _as_str_list(value: Any) -> list[str]:
  505. if value is None:
  506. return []
  507. if isinstance(value, list):
  508. return [str(x) for x in value if str(x).strip()]
  509. s = str(value).strip()
  510. return [s] if s else []
  511. def _content_list_self_ref(index: int) -> str:
  512. return f"{CONTENT_LIST_FILENAME}#/{index}"
  513. def _normalize_grid(grid: Any) -> list[list[str]]:
  514. out: list[list[str]] = []
  515. if not isinstance(grid, list):
  516. return out
  517. for row in grid:
  518. if not isinstance(row, list):
  519. continue
  520. out_row: list[str] = []
  521. for cell in row:
  522. if isinstance(cell, dict):
  523. out_row.append(str(cell.get("text", "")).strip())
  524. else:
  525. out_row.append(str(cell).strip())
  526. out.append(out_row)
  527. return out
  528. def _ir_table_body_has_content(rows: list[list[str]] | None, html: str | None) -> bool:
  529. """True iff the parsed table body carries any visible cell text or HTML."""
  530. if html and html.strip():
  531. return True
  532. if rows:
  533. for row in rows:
  534. for cell in row:
  535. if isinstance(cell, str) and cell.strip():
  536. return True
  537. return False
  538. def _is_block_equation(item: dict) -> bool:
  539. """Heuristic: MinerU's ``text_format`` distinguishes block vs inline.
  540. Fallback when absent: treat as block (most MinerU equation items in
  541. PDF context represent display equations); inline equations are usually
  542. embedded inside ``text`` items rather than first-class ``equation``
  543. items.
  544. """
  545. fmt = str(item.get("text_format") or "").lower()
  546. if fmt in {"inline", "inline_equation"}:
  547. return False
  548. if fmt in {"block", "block_equation", "display"}:
  549. return True
  550. return True
  551. def _extract_page_anchor(item: dict) -> str | None:
  552. """Return a 1-based page anchor from MinerU's ``page_idx`` / ``page``.
  553. Always returns a string so ``blocks.jsonl`` carries a uniform anchor
  554. type across Roman / letter / numeric page labels. Integers are bumped
  555. to 1-based (``page_idx=0`` → ``"1"``); strings are stripped and passed
  556. through verbatim. Returns ``None`` when no usable page info is present.
  557. """
  558. page_raw = item.get("page_idx")
  559. if page_raw is None:
  560. page_raw = item.get("page")
  561. if isinstance(page_raw, bool):
  562. # bool is a subclass of int — guard so True/False don't sneak in.
  563. return None
  564. if isinstance(page_raw, int):
  565. return str(page_raw + 1 if page_raw >= 0 else page_raw)
  566. if isinstance(page_raw, str) and page_raw.strip():
  567. return page_raw.strip()
  568. return None
  569. def _sort_page_anchors(pages: set[str]) -> list[str]:
  570. """Order page anchors using book pagination convention.
  571. Non-numeric labels (Roman preface pages ``i``/``ii``/``iv``…, letter
  572. pages like ``A``, ``B-1``) come first in lexical order; numeric labels
  573. follow, sorted by their integer value so ``"2"`` precedes ``"10"``.
  574. Mixing both kinds is safe — the bucketed key avoids the ``TypeError``
  575. that ``sorted({"ii", "1"})`` raises when ints and strings mix.
  576. """
  577. non_numeric = sorted(p for p in pages if not p.isdigit())
  578. numeric = sorted((p for p in pages if p.isdigit()), key=int)
  579. return non_numeric + numeric
  580. def _extract_bbox_position(item: dict) -> IRPosition | None:
  581. """Build a fine-grained ``IRPosition`` when ``bbox`` is parseable.
  582. Returns ``None`` when ``bbox`` is missing or malformed; the caller then
  583. falls back to page-only tracking via :func:`_extract_page_anchor`.
  584. """
  585. bbox = item.get("bbox")
  586. if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
  587. return None
  588. try:
  589. coords = [float(x) for x in bbox[:4]]
  590. except (TypeError, ValueError):
  591. return None
  592. return IRPosition(type="bbox", anchor=_extract_page_anchor(item), range=coords)
  593. def _safe_local_asset_path(
  594. raw_dir: Path,
  595. img_path: str,
  596. *,
  597. image_url_template: str = "",
  598. ) -> Path | None:
  599. """Resolve ``img_path`` to a concrete file location inside ``raw_dir``.
  600. ``img_path`` comes from MinerU's ``content_list.json`` and is therefore
  601. untrusted. This resolver mirrors :meth:`MinerURawClient._fetch_one_image`
  602. storage rules so the builder always looks where the downloader wrote
  603. the file:
  604. - absolute http(s) URLs and absolute filesystem paths
  605. → ``raw_dir/images/<basename>``;
  606. - any ref when ``MINERU_IMAGE_URL_TEMPLATE`` is configured (the
  607. downloader routes ALL refs — including relative ones — through
  608. :meth:`_image_dest_rel`) → ``raw_dir/images/<basename>``;
  609. - otherwise relative paths resolve under ``raw_dir`` with ``..``
  610. traversal refused and a final ``Path.relative_to`` check.
  611. Returns ``None`` when the candidate is unsafe or cannot be expressed
  612. inside ``raw_dir``. The caller treats ``None`` the same as "file missing"
  613. — the drawing tag still gets written, but no bytes are copied.
  614. """
  615. if not img_path:
  616. return None
  617. if img_path.startswith(("http://", "https://")):
  618. name = Path(urlparse(img_path).path).name
  619. return raw_dir / "images" / name if name else None
  620. if os.path.isabs(img_path):
  621. # Absolute filesystem path in img_path is never trusted to point
  622. # outside raw_dir; mirror the downloader's basename rule.
  623. name = Path(img_path).name
  624. return raw_dir / "images" / name if name else None
  625. if image_url_template:
  626. # Templated mode: downloader stored every ref (incl. relative) at
  627. # images/<basename>, so we must look there too.
  628. name = Path(img_path).name
  629. return raw_dir / "images" / name if name else None
  630. normalized = os.path.normpath(img_path)
  631. if normalized.startswith("..") or os.path.isabs(normalized):
  632. return None
  633. candidate = (raw_dir / normalized).resolve()
  634. try:
  635. candidate.relative_to(raw_dir.resolve())
  636. except ValueError:
  637. return None
  638. return candidate
  639. def _suggested_asset_name(img_path: str, fmt: str, seen_count: int) -> str:
  640. """Pick an in-assets-dir filename for an asset.
  641. For URL refs, use the URL path's basename so we get a useful filename
  642. (``foo.png`` rather than the whole URL). For local refs, the regular
  643. basename. Falls back to ``image-<n>[.fmt]`` when nothing usable.
  644. """
  645. if img_path.startswith(("http://", "https://")):
  646. name = Path(urlparse(img_path).path).name
  647. else:
  648. name = Path(img_path).name
  649. if name:
  650. return name
  651. return f"image-{seen_count + 1}{('.' + fmt) if fmt else ''}"
  652. __all__ = ["MinerUIRBuilder"]