"""Spec-compliance tests for :func:`lightrag.sidecar.write_sidecar`. These assertions are deliberately structural: they encode the contract in ``docs/LightRAGSidecarFormat-zh.md`` so accidental regressions in ``writer.py`` show up before downstream chunker / multimodal consumers see malformed sidecars. """ from __future__ import annotations import json from pathlib import Path import pytest from lightrag.sidecar import ( AssetSpec, IRBlock, IRDoc, IRDrawing, IREquation, IRPosition, IRTable, write_sidecar, ) def _load_jsonl(path: Path) -> tuple[dict, list[dict]]: rows: list[dict] = [] meta: dict = {} with path.open("r", encoding="utf-8") as fh: for i, line in enumerate(fh): obj = json.loads(line) if i == 0: meta = obj else: rows.append(obj) return meta, rows @pytest.mark.offline def test_writer_empty_doc_emits_only_blocks_jsonl(tmp_path: Path) -> None: """Document with no blocks: only the meta line, no per-modality JSONs, no assets dir.""" parsed = tmp_path / "empty.parsed" ir = IRDoc( document_name="empty.docx", document_format="docx", doc_title="empty", split_option={}, blocks=[], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-0001", engine="native") files = {p.name for p in parsed.iterdir()} assert files == {"empty.blocks.jsonl"} meta, rows = _load_jsonl(parsed / "empty.blocks.jsonl") assert meta["type"] == "meta" assert meta["blocks"] == 0 assert meta["asset_dir"] is False assert meta["table_file"] is False assert meta["drawing_file"] is False assert meta["equation_file"] is False assert rows == [] @pytest.mark.offline def test_writer_renders_table_with_inline_body(tmp_path: Path) -> None: """Spec §3.3 / fix 1: rows
; NOT . Also verifies the table's JSON content appears in blocks.jsonl content so doc_hash and F/R/V chunkers see it.""" parsed = tmp_path / "t.parsed" ir = IRDoc( document_name="t.pdf", document_format="pdf", doc_title="t", split_option={}, blocks=[ IRBlock( content_template="prefix {{TBL:t1}} suffix", tables=[ IRTable( placeholder_key="t1", rows=[["a", "b"], ["1", "2"]], num_rows=2, num_cols=2, caption="cap", ) ], ) ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru") _, rows = _load_jsonl(parsed / "t.blocks.jsonl") assert len(rows) == 1 body = rows[0]["content"] assert '' in body assert '[["a", "b"], ["1", "2"]]' in body assert "
" in body # Negative: no placeholder anywhere. assert " None: """Spec §四 / fix 5: drawing path always points inside *.blocks.assets/. Asset must be materialized on disk; meta.asset_dir must reflect it. """ parsed = tmp_path / "d.parsed" ir = IRDoc( document_name="d.pdf", document_format="pdf", doc_title="d", split_option={}, blocks=[ IRBlock( content_template="see {{IMG:i1}}", drawings=[ IRDrawing( placeholder_key="i1", asset_ref="img1", fmt="png", caption="figure 1", ) ], ) ], assets=[AssetSpec(ref="img1", suggested_name="x.png", source=b"\x89PNG")], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru") meta, rows = _load_jsonl(parsed / "d.blocks.jsonl") assert meta["asset_dir"] is True assert meta["drawing_file"] is True body = rows[0]["content"] assert 'path="d.blocks.assets/x.png"' in body assert (parsed / "d.blocks.assets" / "x.png").read_bytes() == b"\x89PNG" drawings = json.loads((parsed / "d.drawings.json").read_text())["drawings"] item = drawings["im-cafebabe-0001"] assert item["path"] == "d.blocks.assets/x.png" assert item["caption"] == "figure 1" assert item["format"] == "png" @pytest.mark.offline def test_writer_equation_strips_dollar_wrappers_for_equations_json( tmp_path: Path, ) -> None: """When IREquation.latex carries MinerU's raw ``$$...$$``/``$..$`` wrappers (preserved so blocks.jsonl shows the source verbatim), the writer must strip them when persisting equations.json content — that file holds clean latex by contract.""" parsed = tmp_path / "d.parsed" ir = IRDoc( document_name="d.pdf", document_format="pdf", doc_title="d", split_option={}, blocks=[ IRBlock( content_template="see {{EQ:b1}}", equations=[ IREquation( placeholder_key="b1", latex="$$\nE = mc^2\n$$", is_block=True, ), ], ) ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-deadbeef", engine="mineru") # blocks.jsonl: body preserves the parser's raw form. body = _load_jsonl(parsed / "d.blocks.jsonl")[1][0]["content"] assert ( '$$\nE = mc^2\n$$' in body ) # equations.json: dollar wrappers removed. equations = json.loads((parsed / "d.equations.json").read_text())["equations"] assert equations["eq-deadbeef-0001"]["content"] == "E = mc^2" @pytest.mark.offline def test_writer_equation_caption_preserved_block_and_inline( tmp_path: Path, ) -> None: """Fix 3 + design decision: on both block and inline forms; inline does NOT receive an id and does NOT enter equations.json (spec §6 / §3.3).""" parsed = tmp_path / "e.parsed" ir = IRDoc( document_name="e.pdf", document_format="pdf", doc_title="e", split_option={}, blocks=[ IRBlock( content_template="block {{EQ:b1}} inline {{EQI:i1}}", equations=[ IREquation( placeholder_key="b1", latex="x^2", is_block=True, caption="Eq 1", ), IREquation( placeholder_key="i1", latex="y_n", is_block=False, caption="Inline", ), ], ) ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru") body = _load_jsonl(parsed / "e.blocks.jsonl")[1][0]["content"] assert ( 'x^2' in body ) # Inline: no id; caption preserved. assert 'y_n' in body equations = json.loads((parsed / "e.equations.json").read_text())["equations"] # Inline equation should NOT have produced a sidecar entry. assert list(equations.keys()) == ["eq-cafebabe-0001"] assert equations["eq-cafebabe-0001"]["caption"] == "Eq 1" @pytest.mark.offline def test_writer_positions_round_trip_bbox(tmp_path: Path) -> None: """Fix 4: positions go through unchanged. bbox type is the mineru path.""" parsed = tmp_path / "p.parsed" ir = IRDoc( document_name="p.pdf", document_format="pdf", doc_title="p", split_option={}, blocks=[ IRBlock( content_template="text", positions=[ IRPosition(type="bbox", anchor=2, range=[10.0, 20.0, 100.0, 200.0]) ], ) ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-aaaa", engine="mineru") rows = _load_jsonl(parsed / "p.blocks.jsonl")[1] assert rows[0]["positions"] == [ {"type": "bbox", "anchor": 2, "range": [10.0, 20.0, 100.0, 200.0]} ] @pytest.mark.offline def test_position_origin_to_jsonable_omits_when_none() -> None: """Spec §八 per-position origin: ``None`` ⇒ field absent (inherit from meta ``bbox_attributes.origin``).""" pos = IRPosition(type="bbox", anchor=1, range=[1.0, 2.0, 3.0, 4.0]) assert "origin" not in pos.to_jsonable() @pytest.mark.offline def test_position_origin_to_jsonable_emits_when_set() -> None: """Spec §八 per-position origin: explicit value ⇒ override field in JSON.""" pos = IRPosition( type="bbox", anchor=1, range=[1.0, 2.0, 3.0, 4.0], origin="LEFTTOP" ) out = pos.to_jsonable() assert out["origin"] == "LEFTTOP" @pytest.mark.offline def test_writer_position_origin_mixed_per_block(tmp_path: Path) -> None: """Docling mixed coord_origin scenario: doc-level origin in meta, per-position override on the minority. Coordinates land verbatim.""" parsed = tmp_path / "mixed.parsed" ir = IRDoc( document_name="mixed.pdf", document_format="pdf", doc_title="mixed", split_option={}, blocks=[ IRBlock( content_template="text", positions=[ IRPosition(type="bbox", anchor=1, range=[10.0, 20.0, 30.0, 40.0]), IRPosition( type="bbox", anchor=1, range=[50.0, 60.0, 70.0, 80.0], origin="LEFTTOP", ), ], ) ], bbox_attributes={"origin": "LEFTBOTTOM"}, ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-bbb1", engine="docling") meta, rows = _load_jsonl(parsed / "mixed.blocks.jsonl") assert meta["bbox_attributes"] == {"origin": "LEFTBOTTOM"} positions = rows[0]["positions"] assert positions[0] == { "type": "bbox", "anchor": 1, "range": [10.0, 20.0, 30.0, 40.0], } assert positions[1] == { "type": "bbox", "anchor": 1, "range": [50.0, 60.0, 70.0, 80.0], "origin": "LEFTTOP", } @pytest.mark.offline def test_writer_drawing_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None: """Spec §四 ``self_ref``: empty string ⇒ field absent; non-empty ⇒ written verbatim. Keeps MinerU/native sidecars byte-compatible.""" parsed = tmp_path / "sref.parsed" ir = IRDoc( document_name="sref.pdf", document_format="pdf", doc_title="sref", split_option={}, blocks=[ IRBlock( content_template="{{IMG:a}} {{IMG:b}}", drawings=[ IRDrawing(placeholder_key="a", asset_ref="img_a", fmt="png"), IRDrawing( placeholder_key="b", asset_ref="img_b", fmt="png", self_ref="#/pictures/3", ), ], ) ], assets=[ AssetSpec(ref="img_a", suggested_name="a.png", source=b"\x89PNG"), AssetSpec(ref="img_b", suggested_name="b.png", source=b"\x89PNG"), ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-ccc1", engine="docling") drawings = json.loads((parsed / "sref.drawings.json").read_text("utf-8"))[ "drawings" ] items = list(drawings.values()) assert "self_ref" not in items[0] assert items[1]["self_ref"] == "#/pictures/3" @pytest.mark.offline def test_writer_table_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None: """Spec §五 ``self_ref``: same omit-when-empty semantics as drawings.""" parsed = tmp_path / "tsref.parsed" ir = IRDoc( document_name="tsref.pdf", document_format="pdf", doc_title="tsref", split_option={}, blocks=[ IRBlock( content_template="{{TBL:a}} {{TBL:b}}", tables=[ IRTable(placeholder_key="a", rows=[["x"]], num_rows=1, num_cols=1), IRTable( placeholder_key="b", rows=[["y"]], num_rows=1, num_cols=1, self_ref="#/tables/0", ), ], ) ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-ddd1", engine="docling") tables = json.loads((parsed / "tsref.tables.json").read_text("utf-8"))["tables"] items = list(tables.values()) assert "self_ref" not in items[0] assert items[1]["self_ref"] == "#/tables/0" @pytest.mark.offline def test_writer_equation_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None: """Spec §六 ``self_ref``: block equations carry it; inline equations never reach equations.json so the field is moot there.""" parsed = tmp_path / "esref.parsed" ir = IRDoc( document_name="esref.pdf", document_format="pdf", doc_title="esref", split_option={}, blocks=[ IRBlock( content_template="{{EQ:a}} {{EQ:b}}", equations=[ IREquation(placeholder_key="a", latex="a+b", is_block=True), IREquation( placeholder_key="b", latex="c+d", is_block=True, self_ref="#/texts/15", ), ], ) ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-eee1", engine="docling") equations = json.loads((parsed / "esref.equations.json").read_text("utf-8"))[ "equations" ] items = list(equations.values()) assert "self_ref" not in items[0] assert items[1]["self_ref"] == "#/texts/15" @pytest.mark.offline def test_writer_id_sequence_is_global_per_kind(tmp_path: Path) -> None: """IDs increment across blocks within their own kind: tables ↑, drawings ↑, equations ↑ — three independent sequences.""" parsed = tmp_path / "s.parsed" blocks = [ IRBlock( content_template="a {{TBL:t}} b {{IMG:i}} c", tables=[IRTable(placeholder_key="t", rows=[["x"]], num_rows=1, num_cols=1)], drawings=[IRDrawing(placeholder_key="i", asset_ref="a1", fmt="png")], ), IRBlock( content_template="d {{EQ:e}} {{TBL:t}}", tables=[IRTable(placeholder_key="t", rows=[["y"]], num_rows=1, num_cols=1)], equations=[IREquation(placeholder_key="e", latex="z", is_block=True)], ), ] ir = IRDoc( document_name="s.pdf", document_format="pdf", doc_title="s", split_option={}, blocks=blocks, assets=[AssetSpec(ref="a1", suggested_name="img.png", source=b"x")], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-bbbb", engine="mineru") tables = json.loads((parsed / "s.tables.json").read_text())["tables"] assert sorted(tables.keys()) == ["tb-bbbb-0001", "tb-bbbb-0002"] drawings = json.loads((parsed / "s.drawings.json").read_text())["drawings"] assert list(drawings.keys()) == ["im-bbbb-0001"] equations = json.loads((parsed / "s.equations.json").read_text())["equations"] assert list(equations.keys()) == ["eq-bbbb-0001"] @pytest.mark.offline def test_writer_empty_block_dropped(tmp_path: Path) -> None: """An IRBlock that strips to empty after placeholder expansion produces no blocks.jsonl row AND no sidecar items (its in-flight placeholders are stillborn).""" parsed = tmp_path / "empty_block.parsed" ir = IRDoc( document_name="x.pdf", document_format="pdf", doc_title="x", split_option={}, blocks=[ IRBlock( content_template=" \n ", tables=[ IRTable( placeholder_key="orphan", rows=[["a"]], num_rows=1, num_cols=1, ) ], ), IRBlock(content_template="real content"), ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-eee", engine="mineru") meta, rows = _load_jsonl(parsed / "x.blocks.jsonl") assert meta["blocks"] == 1 assert len(rows) == 1 assert rows[0]["content"] == "real content" # No tables.json because the orphan placeholder is dropped. assert not (parsed / "x.tables.json").exists() @pytest.mark.offline def test_writer_asset_name_collision_suffixed(tmp_path: Path) -> None: """Two assets with identical suggested_name → second gets ``-2`` stem suffix; drawings.json paths reflect the actual on-disk names.""" parsed = tmp_path / "c.parsed" ir = IRDoc( document_name="c.pdf", document_format="pdf", doc_title="c", split_option={}, blocks=[ IRBlock( content_template="{{IMG:a}} and {{IMG:b}}", drawings=[ IRDrawing(placeholder_key="a", asset_ref="r1", fmt="png"), IRDrawing(placeholder_key="b", asset_ref="r2", fmt="png"), ], ) ], assets=[ AssetSpec(ref="r1", suggested_name="img.png", source=b"a"), AssetSpec(ref="r2", suggested_name="img.png", source=b"b"), ], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-1111", engine="mineru") assets = sorted(p.name for p in (parsed / "c.blocks.assets").iterdir()) assert assets == ["img-2.png", "img.png"] body = _load_jsonl(parsed / "c.blocks.jsonl")[1][0]["content"] assert 'path="c.blocks.assets/img.png"' in body assert 'path="c.blocks.assets/img-2.png"' in body @pytest.mark.offline def test_writer_meta_has_required_spec_fields(tmp_path: Path) -> None: """Spec §3.1: meta line contains every required field at fixed names.""" parsed = tmp_path / "m.parsed" ir = IRDoc( document_name="m.pdf", document_format="pdf", doc_title="title", split_option={"engine_version": "magic-pdf 1.5.4"}, blocks=[IRBlock(content_template="hello")], bbox_attributes={"origin": "LEFTTOP", "max": 1000}, ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-deadbeef", engine="mineru") meta, _ = _load_jsonl(parsed / "m.blocks.jsonl") for k in ( "type", "format", "version", "document_name", "document_format", "document_hash", "table_file", "equation_file", "drawing_file", "asset_dir", "split_option", "blocks", "doc_id", "parse_engine", "parse_time", "doc_title", ): assert k in meta, f"meta missing field: {k}" assert meta["document_hash"].startswith("sha256:") assert meta["parse_engine"] == "mineru" assert meta["bbox_attributes"] == {"origin": "LEFTTOP", "max": 1000} assert meta["split_option"] == {"engine_version": "magic-pdf 1.5.4"} @pytest.mark.offline def test_writer_sidecar_files_only_when_nonempty(tmp_path: Path) -> None: """tables.json / drawings.json / equations.json are NOT written when the corresponding maps are empty (spec §一 table).""" parsed = tmp_path / "n.parsed" ir = IRDoc( document_name="n.docx", document_format="docx", doc_title="n", split_option={}, blocks=[ IRBlock( content_template="{{IMG:i}}", drawings=[IRDrawing(placeholder_key="i", asset_ref="r", fmt="png")], ) ], assets=[AssetSpec(ref="r", suggested_name="i.png", source=b"x")], ) write_sidecar(ir, parsed_dir=parsed, doc_id="doc-aaaa", engine="native") files = {p.name for p in parsed.iterdir() if p.is_file()} assert "n.drawings.json" in files assert "n.tables.json" not in files assert "n.equations.json" not in files @pytest.mark.offline def test_writer_blockid_formula_stable(tmp_path: Path) -> None: """blockid = md5(doc_id:block_index:heading:content). Same content + metadata → same blockid.""" parsed_a = tmp_path / "a.parsed" parsed_b = tmp_path / "b.parsed" ir = IRDoc( document_name="x.pdf", document_format="pdf", doc_title="x", split_option={}, blocks=[IRBlock(content_template="abc", heading="H", level=1)], ) write_sidecar(ir, parsed_dir=parsed_a, doc_id="doc-fixed", engine="mineru") write_sidecar(ir, parsed_dir=parsed_b, doc_id="doc-fixed", engine="mineru") rows_a = _load_jsonl(parsed_a / "x.blocks.jsonl")[1] rows_b = _load_jsonl(parsed_b / "x.blocks.jsonl")[1] assert rows_a[0]["blockid"] == rows_b[0]["blockid"]