test_writer.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612
  1. """Spec-compliance tests for :func:`lightrag.sidecar.write_sidecar`.
  2. These assertions are deliberately structural: they encode the contract in
  3. ``docs/LightRAGSidecarFormat-zh.md`` so accidental regressions in
  4. ``writer.py`` show up before downstream chunker / multimodal consumers see
  5. malformed sidecars.
  6. """
  7. from __future__ import annotations
  8. import json
  9. from pathlib import Path
  10. import pytest
  11. from lightrag.sidecar import (
  12. AssetSpec,
  13. IRBlock,
  14. IRDoc,
  15. IRDrawing,
  16. IREquation,
  17. IRPosition,
  18. IRTable,
  19. write_sidecar,
  20. )
  21. def _load_jsonl(path: Path) -> tuple[dict, list[dict]]:
  22. rows: list[dict] = []
  23. meta: dict = {}
  24. with path.open("r", encoding="utf-8") as fh:
  25. for i, line in enumerate(fh):
  26. obj = json.loads(line)
  27. if i == 0:
  28. meta = obj
  29. else:
  30. rows.append(obj)
  31. return meta, rows
  32. @pytest.mark.offline
  33. def test_writer_empty_doc_emits_only_blocks_jsonl(tmp_path: Path) -> None:
  34. """Document with no blocks: only the meta line, no per-modality JSONs,
  35. no assets dir."""
  36. parsed = tmp_path / "empty.parsed"
  37. ir = IRDoc(
  38. document_name="empty.docx",
  39. document_format="docx",
  40. doc_title="empty",
  41. split_option={},
  42. blocks=[],
  43. )
  44. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-0001", engine="native")
  45. files = {p.name for p in parsed.iterdir()}
  46. assert files == {"empty.blocks.jsonl"}
  47. meta, rows = _load_jsonl(parsed / "empty.blocks.jsonl")
  48. assert meta["type"] == "meta"
  49. assert meta["blocks"] == 0
  50. assert meta["asset_dir"] is False
  51. assert meta["table_file"] is False
  52. assert meta["drawing_file"] is False
  53. assert meta["equation_file"] is False
  54. assert rows == []
  55. @pytest.mark.offline
  56. def test_writer_renders_table_with_inline_body(tmp_path: Path) -> None:
  57. """Spec §3.3 / fix 1: <table id="..." format="json">rows</table>; NOT
  58. <cite type="table">. Also verifies the table's JSON content appears in
  59. blocks.jsonl content so doc_hash and F/R/V chunkers see it."""
  60. parsed = tmp_path / "t.parsed"
  61. ir = IRDoc(
  62. document_name="t.pdf",
  63. document_format="pdf",
  64. doc_title="t",
  65. split_option={},
  66. blocks=[
  67. IRBlock(
  68. content_template="prefix {{TBL:t1}} suffix",
  69. tables=[
  70. IRTable(
  71. placeholder_key="t1",
  72. rows=[["a", "b"], ["1", "2"]],
  73. num_rows=2,
  74. num_cols=2,
  75. caption="cap",
  76. )
  77. ],
  78. )
  79. ],
  80. )
  81. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru")
  82. _, rows = _load_jsonl(parsed / "t.blocks.jsonl")
  83. assert len(rows) == 1
  84. body = rows[0]["content"]
  85. assert '<table id="tb-cafebabe-0001" format="json">' in body
  86. assert '[["a", "b"], ["1", "2"]]' in body
  87. assert "</table>" in body
  88. # Negative: no <cite type="table"> placeholder anywhere.
  89. assert "<cite" not in body
  90. @pytest.mark.offline
  91. def test_writer_drawing_path_points_into_assets_dir(tmp_path: Path) -> None:
  92. """Spec §四 / fix 5: drawing path always points inside *.blocks.assets/.
  93. Asset must be materialized on disk; meta.asset_dir must reflect it.
  94. """
  95. parsed = tmp_path / "d.parsed"
  96. ir = IRDoc(
  97. document_name="d.pdf",
  98. document_format="pdf",
  99. doc_title="d",
  100. split_option={},
  101. blocks=[
  102. IRBlock(
  103. content_template="see {{IMG:i1}}",
  104. drawings=[
  105. IRDrawing(
  106. placeholder_key="i1",
  107. asset_ref="img1",
  108. fmt="png",
  109. caption="figure 1",
  110. )
  111. ],
  112. )
  113. ],
  114. assets=[AssetSpec(ref="img1", suggested_name="x.png", source=b"\x89PNG")],
  115. )
  116. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru")
  117. meta, rows = _load_jsonl(parsed / "d.blocks.jsonl")
  118. assert meta["asset_dir"] is True
  119. assert meta["drawing_file"] is True
  120. body = rows[0]["content"]
  121. assert 'path="d.blocks.assets/x.png"' in body
  122. assert (parsed / "d.blocks.assets" / "x.png").read_bytes() == b"\x89PNG"
  123. drawings = json.loads((parsed / "d.drawings.json").read_text())["drawings"]
  124. item = drawings["im-cafebabe-0001"]
  125. assert item["path"] == "d.blocks.assets/x.png"
  126. assert item["caption"] == "figure 1"
  127. assert item["format"] == "png"
  128. @pytest.mark.offline
  129. def test_writer_equation_strips_dollar_wrappers_for_equations_json(
  130. tmp_path: Path,
  131. ) -> None:
  132. """When IREquation.latex carries MinerU's raw ``$$...$$``/``$..$``
  133. wrappers (preserved so blocks.jsonl shows the source verbatim), the
  134. writer must strip them when persisting equations.json content — that
  135. file holds clean latex by contract."""
  136. parsed = tmp_path / "d.parsed"
  137. ir = IRDoc(
  138. document_name="d.pdf",
  139. document_format="pdf",
  140. doc_title="d",
  141. split_option={},
  142. blocks=[
  143. IRBlock(
  144. content_template="see {{EQ:b1}}",
  145. equations=[
  146. IREquation(
  147. placeholder_key="b1",
  148. latex="$$\nE = mc^2\n$$",
  149. is_block=True,
  150. ),
  151. ],
  152. )
  153. ],
  154. )
  155. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-deadbeef", engine="mineru")
  156. # blocks.jsonl: <equation> body preserves the parser's raw form.
  157. body = _load_jsonl(parsed / "d.blocks.jsonl")[1][0]["content"]
  158. assert (
  159. '<equation id="eq-deadbeef-0001" format="latex">$$\nE = mc^2\n$$</equation>'
  160. in body
  161. )
  162. # equations.json: dollar wrappers removed.
  163. equations = json.loads((parsed / "d.equations.json").read_text())["equations"]
  164. assert equations["eq-deadbeef-0001"]["content"] == "E = mc^2"
  165. @pytest.mark.offline
  166. def test_writer_equation_caption_preserved_block_and_inline(
  167. tmp_path: Path,
  168. ) -> None:
  169. """Fix 3 + design decision: <equation caption="..."> on both block and
  170. inline forms; inline does NOT receive an id and does NOT enter
  171. equations.json (spec §6 / §3.3)."""
  172. parsed = tmp_path / "e.parsed"
  173. ir = IRDoc(
  174. document_name="e.pdf",
  175. document_format="pdf",
  176. doc_title="e",
  177. split_option={},
  178. blocks=[
  179. IRBlock(
  180. content_template="block {{EQ:b1}} inline {{EQI:i1}}",
  181. equations=[
  182. IREquation(
  183. placeholder_key="b1",
  184. latex="x^2",
  185. is_block=True,
  186. caption="Eq 1",
  187. ),
  188. IREquation(
  189. placeholder_key="i1",
  190. latex="y_n",
  191. is_block=False,
  192. caption="Inline",
  193. ),
  194. ],
  195. )
  196. ],
  197. )
  198. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru")
  199. body = _load_jsonl(parsed / "e.blocks.jsonl")[1][0]["content"]
  200. assert (
  201. '<equation id="eq-cafebabe-0001" format="latex" caption="Eq 1">x^2</equation>'
  202. in body
  203. )
  204. # Inline: no id; caption preserved.
  205. assert '<equation format="latex" caption="Inline">y_n</equation>' in body
  206. equations = json.loads((parsed / "e.equations.json").read_text())["equations"]
  207. # Inline equation should NOT have produced a sidecar entry.
  208. assert list(equations.keys()) == ["eq-cafebabe-0001"]
  209. assert equations["eq-cafebabe-0001"]["caption"] == "Eq 1"
  210. @pytest.mark.offline
  211. def test_writer_positions_round_trip_bbox(tmp_path: Path) -> None:
  212. """Fix 4: positions go through unchanged. bbox type is the mineru path."""
  213. parsed = tmp_path / "p.parsed"
  214. ir = IRDoc(
  215. document_name="p.pdf",
  216. document_format="pdf",
  217. doc_title="p",
  218. split_option={},
  219. blocks=[
  220. IRBlock(
  221. content_template="text",
  222. positions=[
  223. IRPosition(type="bbox", anchor=2, range=[10.0, 20.0, 100.0, 200.0])
  224. ],
  225. )
  226. ],
  227. )
  228. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-aaaa", engine="mineru")
  229. rows = _load_jsonl(parsed / "p.blocks.jsonl")[1]
  230. assert rows[0]["positions"] == [
  231. {"type": "bbox", "anchor": 2, "range": [10.0, 20.0, 100.0, 200.0]}
  232. ]
  233. @pytest.mark.offline
  234. def test_position_origin_to_jsonable_omits_when_none() -> None:
  235. """Spec §八 per-position origin: ``None`` ⇒ field absent (inherit from
  236. meta ``bbox_attributes.origin``)."""
  237. pos = IRPosition(type="bbox", anchor=1, range=[1.0, 2.0, 3.0, 4.0])
  238. assert "origin" not in pos.to_jsonable()
  239. @pytest.mark.offline
  240. def test_position_origin_to_jsonable_emits_when_set() -> None:
  241. """Spec §八 per-position origin: explicit value ⇒ override field in JSON."""
  242. pos = IRPosition(
  243. type="bbox", anchor=1, range=[1.0, 2.0, 3.0, 4.0], origin="LEFTTOP"
  244. )
  245. out = pos.to_jsonable()
  246. assert out["origin"] == "LEFTTOP"
  247. @pytest.mark.offline
  248. def test_writer_position_origin_mixed_per_block(tmp_path: Path) -> None:
  249. """Docling mixed coord_origin scenario: doc-level origin in meta,
  250. per-position override on the minority. Coordinates land verbatim."""
  251. parsed = tmp_path / "mixed.parsed"
  252. ir = IRDoc(
  253. document_name="mixed.pdf",
  254. document_format="pdf",
  255. doc_title="mixed",
  256. split_option={},
  257. blocks=[
  258. IRBlock(
  259. content_template="text",
  260. positions=[
  261. IRPosition(type="bbox", anchor=1, range=[10.0, 20.0, 30.0, 40.0]),
  262. IRPosition(
  263. type="bbox",
  264. anchor=1,
  265. range=[50.0, 60.0, 70.0, 80.0],
  266. origin="LEFTTOP",
  267. ),
  268. ],
  269. )
  270. ],
  271. bbox_attributes={"origin": "LEFTBOTTOM"},
  272. )
  273. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-bbb1", engine="docling")
  274. meta, rows = _load_jsonl(parsed / "mixed.blocks.jsonl")
  275. assert meta["bbox_attributes"] == {"origin": "LEFTBOTTOM"}
  276. positions = rows[0]["positions"]
  277. assert positions[0] == {
  278. "type": "bbox",
  279. "anchor": 1,
  280. "range": [10.0, 20.0, 30.0, 40.0],
  281. }
  282. assert positions[1] == {
  283. "type": "bbox",
  284. "anchor": 1,
  285. "range": [50.0, 60.0, 70.0, 80.0],
  286. "origin": "LEFTTOP",
  287. }
  288. @pytest.mark.offline
  289. def test_writer_drawing_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None:
  290. """Spec §四 ``self_ref``: empty string ⇒ field absent; non-empty ⇒
  291. written verbatim. Keeps MinerU/native sidecars byte-compatible."""
  292. parsed = tmp_path / "sref.parsed"
  293. ir = IRDoc(
  294. document_name="sref.pdf",
  295. document_format="pdf",
  296. doc_title="sref",
  297. split_option={},
  298. blocks=[
  299. IRBlock(
  300. content_template="{{IMG:a}} {{IMG:b}}",
  301. drawings=[
  302. IRDrawing(placeholder_key="a", asset_ref="img_a", fmt="png"),
  303. IRDrawing(
  304. placeholder_key="b",
  305. asset_ref="img_b",
  306. fmt="png",
  307. self_ref="#/pictures/3",
  308. ),
  309. ],
  310. )
  311. ],
  312. assets=[
  313. AssetSpec(ref="img_a", suggested_name="a.png", source=b"\x89PNG"),
  314. AssetSpec(ref="img_b", suggested_name="b.png", source=b"\x89PNG"),
  315. ],
  316. )
  317. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-ccc1", engine="docling")
  318. drawings = json.loads((parsed / "sref.drawings.json").read_text("utf-8"))[
  319. "drawings"
  320. ]
  321. items = list(drawings.values())
  322. assert "self_ref" not in items[0]
  323. assert items[1]["self_ref"] == "#/pictures/3"
  324. @pytest.mark.offline
  325. def test_writer_table_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None:
  326. """Spec §五 ``self_ref``: same omit-when-empty semantics as drawings."""
  327. parsed = tmp_path / "tsref.parsed"
  328. ir = IRDoc(
  329. document_name="tsref.pdf",
  330. document_format="pdf",
  331. doc_title="tsref",
  332. split_option={},
  333. blocks=[
  334. IRBlock(
  335. content_template="{{TBL:a}} {{TBL:b}}",
  336. tables=[
  337. IRTable(placeholder_key="a", rows=[["x"]], num_rows=1, num_cols=1),
  338. IRTable(
  339. placeholder_key="b",
  340. rows=[["y"]],
  341. num_rows=1,
  342. num_cols=1,
  343. self_ref="#/tables/0",
  344. ),
  345. ],
  346. )
  347. ],
  348. )
  349. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-ddd1", engine="docling")
  350. tables = json.loads((parsed / "tsref.tables.json").read_text("utf-8"))["tables"]
  351. items = list(tables.values())
  352. assert "self_ref" not in items[0]
  353. assert items[1]["self_ref"] == "#/tables/0"
  354. @pytest.mark.offline
  355. def test_writer_equation_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None:
  356. """Spec §六 ``self_ref``: block equations carry it; inline equations
  357. never reach equations.json so the field is moot there."""
  358. parsed = tmp_path / "esref.parsed"
  359. ir = IRDoc(
  360. document_name="esref.pdf",
  361. document_format="pdf",
  362. doc_title="esref",
  363. split_option={},
  364. blocks=[
  365. IRBlock(
  366. content_template="{{EQ:a}} {{EQ:b}}",
  367. equations=[
  368. IREquation(placeholder_key="a", latex="a+b", is_block=True),
  369. IREquation(
  370. placeholder_key="b",
  371. latex="c+d",
  372. is_block=True,
  373. self_ref="#/texts/15",
  374. ),
  375. ],
  376. )
  377. ],
  378. )
  379. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-eee1", engine="docling")
  380. equations = json.loads((parsed / "esref.equations.json").read_text("utf-8"))[
  381. "equations"
  382. ]
  383. items = list(equations.values())
  384. assert "self_ref" not in items[0]
  385. assert items[1]["self_ref"] == "#/texts/15"
  386. @pytest.mark.offline
  387. def test_writer_id_sequence_is_global_per_kind(tmp_path: Path) -> None:
  388. """IDs increment across blocks within their own kind: tables ↑,
  389. drawings ↑, equations ↑ — three independent sequences."""
  390. parsed = tmp_path / "s.parsed"
  391. blocks = [
  392. IRBlock(
  393. content_template="a {{TBL:t}} b {{IMG:i}} c",
  394. tables=[IRTable(placeholder_key="t", rows=[["x"]], num_rows=1, num_cols=1)],
  395. drawings=[IRDrawing(placeholder_key="i", asset_ref="a1", fmt="png")],
  396. ),
  397. IRBlock(
  398. content_template="d {{EQ:e}} {{TBL:t}}",
  399. tables=[IRTable(placeholder_key="t", rows=[["y"]], num_rows=1, num_cols=1)],
  400. equations=[IREquation(placeholder_key="e", latex="z", is_block=True)],
  401. ),
  402. ]
  403. ir = IRDoc(
  404. document_name="s.pdf",
  405. document_format="pdf",
  406. doc_title="s",
  407. split_option={},
  408. blocks=blocks,
  409. assets=[AssetSpec(ref="a1", suggested_name="img.png", source=b"x")],
  410. )
  411. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-bbbb", engine="mineru")
  412. tables = json.loads((parsed / "s.tables.json").read_text())["tables"]
  413. assert sorted(tables.keys()) == ["tb-bbbb-0001", "tb-bbbb-0002"]
  414. drawings = json.loads((parsed / "s.drawings.json").read_text())["drawings"]
  415. assert list(drawings.keys()) == ["im-bbbb-0001"]
  416. equations = json.loads((parsed / "s.equations.json").read_text())["equations"]
  417. assert list(equations.keys()) == ["eq-bbbb-0001"]
  418. @pytest.mark.offline
  419. def test_writer_empty_block_dropped(tmp_path: Path) -> None:
  420. """An IRBlock that strips to empty after placeholder expansion produces
  421. no blocks.jsonl row AND no sidecar items (its in-flight placeholders
  422. are stillborn)."""
  423. parsed = tmp_path / "empty_block.parsed"
  424. ir = IRDoc(
  425. document_name="x.pdf",
  426. document_format="pdf",
  427. doc_title="x",
  428. split_option={},
  429. blocks=[
  430. IRBlock(
  431. content_template=" \n ",
  432. tables=[
  433. IRTable(
  434. placeholder_key="orphan",
  435. rows=[["a"]],
  436. num_rows=1,
  437. num_cols=1,
  438. )
  439. ],
  440. ),
  441. IRBlock(content_template="real content"),
  442. ],
  443. )
  444. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-eee", engine="mineru")
  445. meta, rows = _load_jsonl(parsed / "x.blocks.jsonl")
  446. assert meta["blocks"] == 1
  447. assert len(rows) == 1
  448. assert rows[0]["content"] == "real content"
  449. # No tables.json because the orphan placeholder is dropped.
  450. assert not (parsed / "x.tables.json").exists()
  451. @pytest.mark.offline
  452. def test_writer_asset_name_collision_suffixed(tmp_path: Path) -> None:
  453. """Two assets with identical suggested_name → second gets ``-2`` stem
  454. suffix; drawings.json paths reflect the actual on-disk names."""
  455. parsed = tmp_path / "c.parsed"
  456. ir = IRDoc(
  457. document_name="c.pdf",
  458. document_format="pdf",
  459. doc_title="c",
  460. split_option={},
  461. blocks=[
  462. IRBlock(
  463. content_template="{{IMG:a}} and {{IMG:b}}",
  464. drawings=[
  465. IRDrawing(placeholder_key="a", asset_ref="r1", fmt="png"),
  466. IRDrawing(placeholder_key="b", asset_ref="r2", fmt="png"),
  467. ],
  468. )
  469. ],
  470. assets=[
  471. AssetSpec(ref="r1", suggested_name="img.png", source=b"a"),
  472. AssetSpec(ref="r2", suggested_name="img.png", source=b"b"),
  473. ],
  474. )
  475. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-1111", engine="mineru")
  476. assets = sorted(p.name for p in (parsed / "c.blocks.assets").iterdir())
  477. assert assets == ["img-2.png", "img.png"]
  478. body = _load_jsonl(parsed / "c.blocks.jsonl")[1][0]["content"]
  479. assert 'path="c.blocks.assets/img.png"' in body
  480. assert 'path="c.blocks.assets/img-2.png"' in body
  481. @pytest.mark.offline
  482. def test_writer_meta_has_required_spec_fields(tmp_path: Path) -> None:
  483. """Spec §3.1: meta line contains every required field at fixed names."""
  484. parsed = tmp_path / "m.parsed"
  485. ir = IRDoc(
  486. document_name="m.pdf",
  487. document_format="pdf",
  488. doc_title="title",
  489. split_option={"engine_version": "magic-pdf 1.5.4"},
  490. blocks=[IRBlock(content_template="hello")],
  491. bbox_attributes={"origin": "LEFTTOP", "max": 1000},
  492. )
  493. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-deadbeef", engine="mineru")
  494. meta, _ = _load_jsonl(parsed / "m.blocks.jsonl")
  495. for k in (
  496. "type",
  497. "format",
  498. "version",
  499. "document_name",
  500. "document_format",
  501. "document_hash",
  502. "table_file",
  503. "equation_file",
  504. "drawing_file",
  505. "asset_dir",
  506. "split_option",
  507. "blocks",
  508. "doc_id",
  509. "parse_engine",
  510. "parse_time",
  511. "doc_title",
  512. ):
  513. assert k in meta, f"meta missing field: {k}"
  514. assert meta["document_hash"].startswith("sha256:")
  515. assert meta["parse_engine"] == "mineru"
  516. assert meta["bbox_attributes"] == {"origin": "LEFTTOP", "max": 1000}
  517. assert meta["split_option"] == {"engine_version": "magic-pdf 1.5.4"}
  518. @pytest.mark.offline
  519. def test_writer_sidecar_files_only_when_nonempty(tmp_path: Path) -> None:
  520. """tables.json / drawings.json / equations.json are NOT written when
  521. the corresponding maps are empty (spec §一 table)."""
  522. parsed = tmp_path / "n.parsed"
  523. ir = IRDoc(
  524. document_name="n.docx",
  525. document_format="docx",
  526. doc_title="n",
  527. split_option={},
  528. blocks=[
  529. IRBlock(
  530. content_template="{{IMG:i}}",
  531. drawings=[IRDrawing(placeholder_key="i", asset_ref="r", fmt="png")],
  532. )
  533. ],
  534. assets=[AssetSpec(ref="r", suggested_name="i.png", source=b"x")],
  535. )
  536. write_sidecar(ir, parsed_dir=parsed, doc_id="doc-aaaa", engine="native")
  537. files = {p.name for p in parsed.iterdir() if p.is_file()}
  538. assert "n.drawings.json" in files
  539. assert "n.tables.json" not in files
  540. assert "n.equations.json" not in files
  541. @pytest.mark.offline
  542. def test_writer_blockid_formula_stable(tmp_path: Path) -> None:
  543. """blockid = md5(doc_id:block_index:heading:content). Same content +
  544. metadata → same blockid."""
  545. parsed_a = tmp_path / "a.parsed"
  546. parsed_b = tmp_path / "b.parsed"
  547. ir = IRDoc(
  548. document_name="x.pdf",
  549. document_format="pdf",
  550. doc_title="x",
  551. split_option={},
  552. blocks=[IRBlock(content_template="abc", heading="H", level=1)],
  553. )
  554. write_sidecar(ir, parsed_dir=parsed_a, doc_id="doc-fixed", engine="mineru")
  555. write_sidecar(ir, parsed_dir=parsed_b, doc_id="doc-fixed", engine="mineru")
  556. rows_a = _load_jsonl(parsed_a / "x.blocks.jsonl")[1]
  557. rows_b = _load_jsonl(parsed_b / "x.blocks.jsonl")[1]
  558. assert rows_a[0]["blockid"] == rows_b[0]["blockid"]