test_ir_builder.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. """MinerU IR builder tests: content_list.json → IR translation."""
  2. from __future__ import annotations
  3. import json
  4. from pathlib import Path
  5. import pytest
  6. from lightrag.parser.external.mineru import MinerUIRBuilder
  7. def _write_bundle(tmp_path: Path, content_list: list[dict]) -> Path:
  8. """Build a minimal *.mineru_raw/ directory."""
  9. raw = tmp_path / "doc.mineru_raw"
  10. raw.mkdir()
  11. (raw / "content_list.json").write_text(json.dumps(content_list, ensure_ascii=False))
  12. return raw
  13. @pytest.mark.offline
  14. def test_adapter_simple_text_and_heading(tmp_path: Path) -> None:
  15. raw = _write_bundle(
  16. tmp_path,
  17. [
  18. {"type": "text", "text": "1 Introduction", "text_level": 1},
  19. {"type": "text", "text": "Body paragraph."},
  20. {"type": "text", "text": "1.1 Sub", "text_level": 2},
  21. {"type": "text", "text": "Sub body."},
  22. ],
  23. )
  24. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="x.pdf")
  25. assert ir.doc_title == "1 Introduction"
  26. assert ir.document_format == "pdf"
  27. # Heading + body merge into a single block per heading.
  28. assert len(ir.blocks) == 2
  29. assert ir.blocks[0].heading == "1 Introduction"
  30. assert ir.blocks[0].level == 1
  31. # Heading line is rendered with markdown ``#`` prefix matching the level.
  32. assert ir.blocks[0].content_template == "# 1 Introduction\nBody paragraph."
  33. # Sub-heading updates stack and records parent.
  34. assert ir.blocks[1].heading == "1.1 Sub"
  35. assert ir.blocks[1].level == 2
  36. assert ir.blocks[1].parent_headings == ["1 Introduction"]
  37. assert ir.blocks[1].content_template == "## 1.1 Sub\nSub body."
  38. @pytest.mark.offline
  39. def test_adapter_preface_block_for_pre_heading_content(tmp_path: Path) -> None:
  40. """Items emitted before the first heading land in a synthetic
  41. ``Preface/Uncategorized`` block at level 0."""
  42. raw = _write_bundle(
  43. tmp_path,
  44. [
  45. {"type": "text", "text": "Floating intro line."},
  46. {"type": "list", "list_items": ["a", "b"]},
  47. {"type": "text", "text": "Section A", "text_level": 1},
  48. {"type": "text", "text": "A body."},
  49. ],
  50. )
  51. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="p.pdf")
  52. assert len(ir.blocks) == 2
  53. preface = ir.blocks[0]
  54. assert preface.heading == "Preface/Uncategorized"
  55. assert preface.level == 0
  56. assert preface.parent_headings == []
  57. assert preface.content_template == "Floating intro line.\na\nb"
  58. section = ir.blocks[1]
  59. assert section.heading == "Section A"
  60. assert section.level == 1
  61. assert section.content_template == "# Section A\nA body."
  62. @pytest.mark.offline
  63. def test_adapter_merges_mixed_payloads_under_heading(tmp_path: Path) -> None:
  64. """Tables / images / equations / code under the same heading merge into
  65. one block; their placeholders appear in document order."""
  66. raw = _write_bundle(
  67. tmp_path,
  68. [
  69. {"type": "text", "text": "Methods", "text_level": 1},
  70. {"type": "text", "text": "We did stuff."},
  71. {
  72. "type": "table",
  73. "table_body": [["a", "b"], ["1", "2"]],
  74. "num_rows": 2,
  75. "num_cols": 2,
  76. },
  77. {"type": "image", "img_path": "images/fig1.png"},
  78. {"type": "equation", "text": "$$E = mc^2$$"},
  79. {"type": "code", "code_body": "print('ok')"},
  80. ],
  81. )
  82. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="m.pdf")
  83. assert len(ir.blocks) == 1
  84. block = ir.blocks[0]
  85. assert block.heading == "Methods"
  86. assert block.level == 1
  87. assert len(block.tables) == 1
  88. assert len(block.drawings) == 1
  89. assert len(block.equations) == 1
  90. # Lines are joined in source order; the heading carries its ``#`` prefix.
  91. expected_lines = [
  92. "# Methods",
  93. "We did stuff.",
  94. f"{{{{TBL:{block.tables[0].placeholder_key}}}}}",
  95. f"{{{{IMG:{block.drawings[0].placeholder_key}}}}}",
  96. f"{{{{EQ:{block.equations[0].placeholder_key}}}}}",
  97. "print('ok')",
  98. ]
  99. assert block.content_template == "\n".join(expected_lines)
  100. @pytest.mark.offline
  101. def test_adapter_table_and_drawing_and_equation(tmp_path: Path) -> None:
  102. raw = _write_bundle(
  103. tmp_path,
  104. [
  105. {
  106. "type": "table",
  107. "table_body": [["a", "b"], ["1", "2"]],
  108. "num_rows": 2,
  109. "num_cols": 2,
  110. "table_caption": ["Tbl"],
  111. "header": [["a", "b"]],
  112. },
  113. {
  114. "type": "image",
  115. "img_path": "images/img_001.jpg",
  116. "image_caption": ["Fig 1"],
  117. "page_idx": 1,
  118. "bbox": [10, 20, 30, 40],
  119. },
  120. {"type": "equation", "text": "$E = mc^2$", "caption": "Eq 1"},
  121. ],
  122. )
  123. # The drawing references images/img_001.jpg — adapter accepts missing
  124. # files and produces an AssetSpec with source=None.
  125. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="d.pdf")
  126. table_block = next(b for b in ir.blocks if b.tables)
  127. table = table_block.tables[0]
  128. assert table.rows == [["a", "b"], ["1", "2"]]
  129. assert table.num_rows == 2 and table.num_cols == 2
  130. assert table.caption == "Tbl"
  131. assert table.table_header == [["a", "b"]]
  132. assert table.self_ref == "content_list.json#/0"
  133. drawing_block = next(b for b in ir.blocks if b.drawings)
  134. drawing = drawing_block.drawings[0]
  135. assert drawing.fmt == "jpg"
  136. assert drawing.caption == "Fig 1"
  137. assert drawing.self_ref == "content_list.json#/1"
  138. # Position carried through. The bbox-bearing item produces exactly one
  139. # fine-grained position (anchor + range) and is NOT also rolled into the
  140. # page-only summary channel — so the block has a single position entry,
  141. # not a duplicate summary + bbox pair.
  142. assert len(drawing_block.positions) == 1
  143. assert drawing_block.positions[0].type == "bbox"
  144. # Anchor is always serialized as a string (uniform on-disk format,
  145. # accommodates book pagination labels like Roman "ii").
  146. assert drawing_block.positions[0].anchor == "2" # page_idx+1
  147. assert drawing_block.positions[0].range == [10.0, 20.0, 30.0, 40.0]
  148. # Asset is declared with the relative path as ref.
  149. assert any(a.ref == "images/img_001.jpg" for a in ir.assets)
  150. equation_block = next(b for b in ir.blocks if b.equations)
  151. eq = equation_block.equations[0]
  152. # IREquation.latex preserves MinerU's raw form so blocks.jsonl shows it
  153. # verbatim; equations.json strips the ``$`` wrappers downstream (writer).
  154. assert eq.latex == "$E = mc^2$"
  155. assert eq.is_block is True
  156. assert eq.caption == "Eq 1"
  157. assert eq.self_ref == "content_list.json#/2"
  158. @pytest.mark.offline
  159. def test_adapter_page_idx_aggregated_and_deduped_when_no_bbox(
  160. tmp_path: Path,
  161. ) -> None:
  162. """Real MinerU output carries ``page_idx`` on every item but rarely a
  163. ``bbox``. Each unique page contributing to a merged block must surface as
  164. one anchor-only ``{type:"bbox", anchor:<page+1>}`` entry, sorted, no
  165. duplicates, no ``range``.
  166. """
  167. raw = _write_bundle(
  168. tmp_path,
  169. [
  170. {"type": "text", "text": "Section", "text_level": 1, "page_idx": 0},
  171. {"type": "text", "text": "line A", "page_idx": 0},
  172. {"type": "text", "text": "line B", "page_idx": 1},
  173. {"type": "text", "text": "line C", "page_idx": 1},
  174. {"type": "text", "text": "line D", "page_idx": 2},
  175. ],
  176. )
  177. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="p.pdf")
  178. assert len(ir.blocks) == 1
  179. block = ir.blocks[0]
  180. # Pages 0, 1, 2 → anchors "1", "2", "3" — one entry per unique page.
  181. # Anchors are persisted as strings for on-disk uniformity.
  182. assert len(block.positions) == 3
  183. anchors = [p.anchor for p in block.positions]
  184. assert anchors == ["1", "2", "3"]
  185. for pos in block.positions:
  186. assert pos.type == "bbox"
  187. # Page-only summary entries have no range; ``to_jsonable`` must omit
  188. # the key entirely.
  189. assert pos.range is None
  190. assert "range" not in pos.to_jsonable()
  191. @pytest.mark.offline
  192. def test_adapter_bbox_items_and_page_only_items_coexist(tmp_path: Path) -> None:
  193. """When a block merges both bbox-bearing and bbox-less items, the bbox
  194. items are emitted per-item (no dedupe, with ``range``) and only the
  195. bbox-less items contribute to the page-only summary. Ordering: summary
  196. first (sorted by anchor), bbox entries after (source order).
  197. """
  198. raw = _write_bundle(
  199. tmp_path,
  200. [
  201. {"type": "text", "text": "Mixed", "text_level": 1, "page_idx": 1},
  202. {
  203. "type": "image",
  204. "img_path": "images/fig.png",
  205. "page_idx": 1,
  206. "bbox": [10, 20, 30, 40],
  207. },
  208. {"type": "text", "text": "tail line", "page_idx": 2},
  209. ],
  210. )
  211. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="m.pdf")
  212. assert len(ir.blocks) == 1
  213. positions = ir.blocks[0].positions
  214. # One page-only summary for page 3 (the bbox-less tail line) and one
  215. # bbox entry for page 2 (the image). The heading item has page_idx=1
  216. # but no bbox, so it adds anchor 2 to the page set — combined with the
  217. # tail item's anchor 3 the summary section has TWO anchors (1+1, 2+1).
  218. assert [(p.anchor, p.range) for p in positions] == [
  219. ("2", None),
  220. ("3", None),
  221. ("2", [10.0, 20.0, 30.0, 40.0]),
  222. ]
  223. @pytest.mark.offline
  224. def test_adapter_page_sort_books_convention_with_mixed_anchors(
  225. tmp_path: Path,
  226. ) -> None:
  227. """Block merges items with Roman preface labels and Arabic numerals.
  228. Two guarantees:
  229. 1. The adapter must not crash when sorting heterogeneous anchors — a
  230. previous bug surfaced ``TypeError: '<' not supported between
  231. instances of 'str' and 'int'`` whenever ``page_idx`` mixed types.
  232. 2. Output order follows book pagination convention: Roman / letter
  233. labels first (lexical), then numeric pages by integer value, so
  234. ``"2"`` precedes ``"10"`` (not ``"10"`` before ``"2"`` as a naive
  235. lexical sort would do).
  236. """
  237. raw = _write_bundle(
  238. tmp_path,
  239. [
  240. {
  241. "type": "text",
  242. "text": "Mixed Pagination",
  243. "text_level": 1,
  244. "page_idx": "i",
  245. },
  246. {"type": "text", "text": "preface intro", "page_idx": "i"},
  247. {"type": "text", "text": "preface tail", "page_idx": "ii"},
  248. {"type": "text", "text": "chapter line A", "page_idx": 1}, # → "2"
  249. {"type": "text", "text": "chapter line B", "page_idx": 9}, # → "10"
  250. ],
  251. )
  252. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="mix.pdf")
  253. assert len(ir.blocks) == 1
  254. anchors = [p.anchor for p in ir.blocks[0].positions]
  255. # Roman labels first (lex order), then numerics by int value.
  256. assert anchors == ["i", "ii", "2", "10"]
  257. @pytest.mark.offline
  258. def test_adapter_empty_text_item_does_not_leak_page_to_block(
  259. tmp_path: Path,
  260. ) -> None:
  261. """An item whose body is empty must NOT contribute its ``page_idx`` to
  262. the current block's positions — otherwise spurious pages from
  263. content-less items poison provenance.
  264. Regression: empty text on page 99 sits between two real headings; its
  265. page must not appear under either block.
  266. """
  267. raw = _write_bundle(
  268. tmp_path,
  269. [
  270. {"type": "text", "text": "Section A", "text_level": 1, "page_idx": 0},
  271. {"type": "text", "text": "real body", "page_idx": 0},
  272. # Empty body — should be silently dropped, page_idx not recorded.
  273. {"type": "text", "text": "", "page_idx": 98},
  274. {"type": "text", "text": "Section B", "text_level": 1, "page_idx": 1},
  275. {"type": "text", "text": "next body", "page_idx": 1},
  276. ],
  277. )
  278. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="leak.pdf")
  279. assert len(ir.blocks) == 2
  280. a_anchors = [p.anchor for p in ir.blocks[0].positions]
  281. b_anchors = [p.anchor for p in ir.blocks[1].positions]
  282. # Section A only mentions page 1 (page_idx 0 + 1) — NOT 99 from the
  283. # dropped empty item.
  284. assert a_anchors == ["1"]
  285. assert "99" not in a_anchors and "99" not in b_anchors
  286. # Section B only mentions page 2 (page_idx 1 + 1).
  287. assert b_anchors == ["2"]
  288. @pytest.mark.offline
  289. def test_adapter_adjacent_deeper_heading_merged_as_body(tmp_path: Path) -> None:
  290. """Two headings in a row with no body between them: when the second is
  291. strictly deeper (level number larger), it folds into the first heading's
  292. block as a body line. Mirrors the native docx parser's behaviour.
  293. """
  294. raw = _write_bundle(
  295. tmp_path,
  296. [
  297. {"type": "text", "text": "1 Top", "text_level": 1},
  298. {"type": "text", "text": "1.1 Mid", "text_level": 2},
  299. {"type": "text", "text": "1.1.1 Deep", "text_level": 3},
  300. {"type": "text", "text": "Body for deep."},
  301. {"type": "text", "text": "2 Top Again", "text_level": 1},
  302. {"type": "text", "text": "More body."},
  303. ],
  304. )
  305. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="m.pdf")
  306. # First "1 Top" absorbs the immediately-following deeper headings;
  307. # body lands inside the same block. Then a new top-level heading
  308. # opens a fresh block.
  309. assert len(ir.blocks) == 2
  310. merged = ir.blocks[0]
  311. assert merged.heading == "1 Top"
  312. assert merged.level == 1
  313. assert merged.parent_headings == []
  314. assert merged.content_template == (
  315. "# 1 Top\n## 1.1 Mid\n### 1.1.1 Deep\nBody for deep."
  316. )
  317. fresh = ir.blocks[1]
  318. assert fresh.heading == "2 Top Again"
  319. assert fresh.level == 1
  320. # Heading stack reset cleanly — no stale deep parents leak.
  321. assert fresh.parent_headings == []
  322. assert fresh.content_template == "# 2 Top Again\nMore body."
  323. @pytest.mark.offline
  324. def test_adapter_adjacent_shallower_heading_starts_new_block(
  325. tmp_path: Path,
  326. ) -> None:
  327. """Inverse case: when the second adjacent heading is shallower (level
  328. number smaller or equal), it must NOT merge — it starts a new block.
  329. """
  330. raw = _write_bundle(
  331. tmp_path,
  332. [
  333. {"type": "text", "text": "1.1 Mid first", "text_level": 2},
  334. {"type": "text", "text": "2 Top after", "text_level": 1},
  335. {"type": "text", "text": "body"},
  336. ],
  337. )
  338. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="m.pdf")
  339. # The first block is heading-only; the writer downstream will keep it
  340. # (the merged-heading rule only forwards DEEPER headings).
  341. assert len(ir.blocks) == 2
  342. assert ir.blocks[0].heading == "1.1 Mid first"
  343. assert ir.blocks[0].level == 2
  344. assert ir.blocks[0].content_template == "## 1.1 Mid first"
  345. assert ir.blocks[1].heading == "2 Top after"
  346. assert ir.blocks[1].level == 1
  347. assert ir.blocks[1].content_template == "# 2 Top after\nbody"
  348. @pytest.mark.offline
  349. def test_adapter_body_breaks_adjacent_heading_merge(tmp_path: Path) -> None:
  350. """Once any body content lands in the current block, the next heading —
  351. even a deeper one — must flush and open a fresh block (no merge)."""
  352. raw = _write_bundle(
  353. tmp_path,
  354. [
  355. {"type": "text", "text": "1 Top", "text_level": 1},
  356. {"type": "text", "text": "Intro line under 1."},
  357. {"type": "text", "text": "1.1 Mid", "text_level": 2},
  358. {"type": "text", "text": "Mid body."},
  359. ],
  360. )
  361. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="m.pdf")
  362. assert len(ir.blocks) == 2
  363. assert ir.blocks[0].content_template == "# 1 Top\nIntro line under 1."
  364. assert ir.blocks[1].heading == "1.1 Mid"
  365. assert ir.blocks[1].parent_headings == ["1 Top"]
  366. assert ir.blocks[1].content_template == "## 1.1 Mid\nMid body."
  367. @pytest.mark.offline
  368. def test_adapter_block_equation_preserves_dollar_wrappers(tmp_path: Path) -> None:
  369. """Block equations keep the ``$$`` markers verbatim on IREquation.latex
  370. so the writer renders blocks.jsonl's ``<equation>`` body byte-identical
  371. to MinerU's source. The downstream writer is responsible for stripping
  372. them when generating equations.json."""
  373. raw = _write_bundle(
  374. tmp_path,
  375. [
  376. {
  377. "type": "equation",
  378. "text": "$$\n\\int_0^1 x dx = \\tfrac{1}{2}\n$$",
  379. "text_format": "block",
  380. "caption": "Eq A",
  381. },
  382. ],
  383. )
  384. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="b.pdf")
  385. eq = ir.blocks[0].equations[0]
  386. assert eq.is_block is True
  387. # No stripping in the adapter; whitespace.strip() only.
  388. assert eq.latex == "$$\n\\int_0^1 x dx = \\tfrac{1}{2}\n$$"
  389. @pytest.mark.offline
  390. def test_adapter_empty_equation_dropped(tmp_path: Path) -> None:
  391. """Fix 2: equation items with empty text MUST NOT enter the IR (and
  392. consequently not the sidecar). They previously left dangling sidecar
  393. entries."""
  394. raw = _write_bundle(
  395. tmp_path,
  396. [
  397. {"type": "equation", "text": "", "caption": "ghost"},
  398. {"type": "equation", "text": " ", "caption": "ghost"},
  399. {"type": "text", "text": "kept"},
  400. ],
  401. )
  402. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="g.pdf")
  403. eq_count = sum(len(b.equations) for b in ir.blocks)
  404. assert eq_count == 0
  405. assert any(b.content_template == "kept" for b in ir.blocks)
  406. @pytest.mark.offline
  407. def test_adapter_empty_table_dropped(tmp_path: Path) -> None:
  408. """Table items with no usable body MUST NOT enter the IR.
  409. MinerU sometimes misidentifies a page-number / blank region as a table
  410. and emits a body-less ``table`` item (missing ``table_body``/``rows``,
  411. or with an empty string / empty grid). Leaving such items in the IR
  412. would later trip the analyze worker's hard-failure path on empty
  413. ``content``. The IR builder filters them upstream.
  414. """
  415. raw = _write_bundle(
  416. tmp_path,
  417. [
  418. # 1) Body field completely absent.
  419. {"type": "table", "num_rows": 0, "num_cols": 0},
  420. # 2) Empty string body (matches the real m012-manual.pdf bug).
  421. {"type": "table", "table_body": ""},
  422. # 3) Empty list body.
  423. {"type": "table", "rows": []},
  424. # 4) Grid with only blank cells.
  425. {"type": "table", "rows": [["", " "], ["\t", ""]]},
  426. # 5) A real text item so the IR is not entirely empty.
  427. {"type": "text", "text": "kept"},
  428. ],
  429. )
  430. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="t.pdf")
  431. table_count = sum(len(b.tables) for b in ir.blocks)
  432. assert table_count == 0
  433. # No table placeholder should leak into the rendered content either.
  434. joined = "\n".join(b.content_template for b in ir.blocks)
  435. assert "TBL:" not in joined
  436. assert "kept" in joined
  437. @pytest.mark.offline
  438. def test_adapter_bbox_attributes_default_and_override(tmp_path: Path) -> None:
  439. raw = _write_bundle(tmp_path, [{"type": "text", "text": "x"}])
  440. adapter = MinerUIRBuilder()
  441. ir = adapter.normalize_from_workdir(raw, document_name="x.pdf")
  442. assert ir.bbox_attributes == {"origin": "LEFTTOP", "max": 1000}
  443. @pytest.mark.offline
  444. def test_adapter_bbox_attributes_env_override(
  445. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  446. ) -> None:
  447. monkeypatch.setenv(
  448. "MINERU_BBOX_ATTRIBUTES",
  449. '{"origin": "LEFTBOTTOM", "max": 612}',
  450. )
  451. raw = _write_bundle(tmp_path, [{"type": "text", "text": "x"}])
  452. adapter = MinerUIRBuilder()
  453. ir = adapter.normalize_from_workdir(raw, document_name="x.pdf")
  454. assert ir.bbox_attributes == {"origin": "LEFTBOTTOM", "max": 612}
  455. @pytest.mark.offline
  456. def test_adapter_engine_version_recorded_in_split_option(
  457. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  458. ) -> None:
  459. monkeypatch.setenv("MINERU_ENGINE_VERSION", "magic-pdf 1.5.4")
  460. raw = _write_bundle(tmp_path, [{"type": "text", "text": "x"}])
  461. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="x.pdf")
  462. assert ir.split_option == {"engine_version": "magic-pdf 1.5.4"}
  463. @pytest.mark.offline
  464. def test_adapter_missing_content_list_raises(tmp_path: Path) -> None:
  465. raw_dir = tmp_path / "bad.mineru_raw"
  466. raw_dir.mkdir()
  467. with pytest.raises(FileNotFoundError):
  468. MinerUIRBuilder().normalize_from_workdir(raw_dir, document_name="x.pdf")
  469. @pytest.mark.offline
  470. def test_adapter_html_table_fallback(tmp_path: Path) -> None:
  471. """If table_body is a string that is not JSON, treat as HTML and keep
  472. on IRTable.html so the writer emits format="html"."""
  473. raw = _write_bundle(
  474. tmp_path,
  475. [
  476. {
  477. "type": "table",
  478. "table_body": "<table><tr><td>a</td></tr></table>",
  479. "num_rows": 1,
  480. "num_cols": 1,
  481. }
  482. ],
  483. )
  484. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="h.pdf")
  485. table = ir.blocks[0].tables[0]
  486. assert table.rows is None
  487. assert table.html and "<td>a</td>" in table.html
  488. @pytest.mark.offline
  489. def test_adapter_list_items_joined_with_newline(tmp_path: Path) -> None:
  490. raw = _write_bundle(
  491. tmp_path,
  492. [
  493. {"type": "list", "list_items": ["one", "two", "three"]},
  494. ],
  495. )
  496. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="l.pdf")
  497. assert ir.blocks[0].content_template == "one\ntwo\nthree"
  498. @pytest.mark.offline
  499. def test_adapter_drawing_asset_source_only_when_file_exists(
  500. tmp_path: Path,
  501. ) -> None:
  502. """The adapter should declare an AssetSpec for the drawing in both
  503. cases, but ``source`` is set only when the bytes are on disk; the
  504. writer then warns and skips a missing-source asset."""
  505. raw = _write_bundle(
  506. tmp_path,
  507. [
  508. {"type": "image", "img_path": "images/exists.png"},
  509. {"type": "image", "img_path": "images/missing.png"},
  510. ],
  511. )
  512. (raw / "images").mkdir()
  513. (raw / "images" / "exists.png").write_bytes(b"\x89PNG")
  514. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="a.pdf")
  515. by_ref = {a.ref: a for a in ir.assets}
  516. assert by_ref["images/exists.png"].source is not None
  517. assert by_ref["images/missing.png"].source is None
  518. @pytest.mark.offline
  519. def test_adapter_refuses_path_traversal_img_path(tmp_path: Path) -> None:
  520. """Untrusted img_path with ``..`` or absolute filesystem segments must
  521. not be allowed to point ``AssetSpec.source`` outside ``raw_dir``.
  522. Otherwise the writer would copy attacker-named files from the host into
  523. the sidecar's ``*.blocks.assets/`` directory (file-disclosure path).
  524. """
  525. # Place a "secret" file outside the raw bundle that should never be
  526. # selectable as an asset source.
  527. secret = tmp_path / "secret.txt"
  528. secret.write_bytes(b"private")
  529. raw = _write_bundle(
  530. tmp_path,
  531. [
  532. {"type": "image", "img_path": "../secret.txt"},
  533. {"type": "image", "img_path": str(secret)}, # absolute path
  534. ],
  535. )
  536. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="x.pdf")
  537. by_ref = {a.ref: a for a in ir.assets}
  538. # Relative ``..`` escape is rejected outright.
  539. assert by_ref["../secret.txt"].source is None
  540. # Absolute filesystem path is reinterpreted as ``images/<basename>``
  541. # inside raw_dir. Since no such file exists, source must remain None
  542. # (and crucially must not point at the original secret file).
  543. abs_asset = by_ref[str(secret)]
  544. assert abs_asset.source is None
  545. @pytest.mark.offline
  546. def test_adapter_absolute_url_img_path_resolves_to_images_basename(
  547. tmp_path: Path,
  548. ) -> None:
  549. """When MinerU emits an absolute URL in img_path, the downloader saves
  550. it as ``images/<basename>``; the adapter must look there too."""
  551. raw = _write_bundle(
  552. tmp_path,
  553. [
  554. {
  555. "type": "image",
  556. "img_path": "https://cdn.example.com/imgs/figure_42.png",
  557. },
  558. ],
  559. )
  560. (raw / "images").mkdir()
  561. (raw / "images" / "figure_42.png").write_bytes(b"\x89PNGfake")
  562. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="u.pdf")
  563. asset = ir.assets[0]
  564. assert asset.ref == "https://cdn.example.com/imgs/figure_42.png"
  565. assert asset.suggested_name == "figure_42.png"
  566. assert asset.source is not None
  567. assert asset.source.read_bytes() == b"\x89PNGfake"
  568. @pytest.mark.offline
  569. def test_adapter_image_url_template_mode_maps_relative_to_images_basename(
  570. tmp_path: Path, monkeypatch: pytest.MonkeyPatch
  571. ) -> None:
  572. """When MINERU_IMAGE_URL_TEMPLATE is set, MinerURawClient stores every
  573. image reference — including relative ones — at ``images/<basename>``.
  574. The adapter must mirror that lookup so the asset is wired up, otherwise
  575. the downloaded bytes are silently dropped from the sidecar."""
  576. monkeypatch.setenv(
  577. "MINERU_IMAGE_URL_TEMPLATE",
  578. "http://mineru.internal/assets/{name}",
  579. )
  580. raw = _write_bundle(
  581. tmp_path,
  582. [
  583. {"type": "image", "img_path": "page/img.png"},
  584. ],
  585. )
  586. # Downloader's actual landing spot in template mode.
  587. (raw / "images").mkdir()
  588. (raw / "images" / "img.png").write_bytes(b"\x89PNGtemplate")
  589. # The "naive" location (raw_dir/page/img.png) does NOT exist; in
  590. # template mode the downloader does not write there.
  591. assert not (raw / "page" / "img.png").exists()
  592. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="t.pdf")
  593. asset = ir.assets[0]
  594. assert asset.source is not None
  595. assert asset.source.read_bytes() == b"\x89PNGtemplate"
  596. @pytest.mark.offline
  597. def test_adapter_no_template_keeps_relative_path_lookup(
  598. tmp_path: Path,
  599. ) -> None:
  600. """Sanity: without MINERU_IMAGE_URL_TEMPLATE, a relative img_path still
  601. resolves under raw_dir at its original location (regression guard for
  602. the template-mode change above)."""
  603. raw = _write_bundle(
  604. tmp_path,
  605. [
  606. {"type": "image", "img_path": "page/img.png"},
  607. ],
  608. )
  609. (raw / "page").mkdir()
  610. (raw / "page" / "img.png").write_bytes(b"\x89PNGrel")
  611. ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="r.pdf")
  612. asset = ir.assets[0]
  613. assert asset.source is not None
  614. assert asset.source.read_bytes() == b"\x89PNGrel"