| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612 |
- """Spec-compliance tests for :func:`lightrag.sidecar.write_sidecar`.
- These assertions are deliberately structural: they encode the contract in
- ``docs/LightRAGSidecarFormat-zh.md`` so accidental regressions in
- ``writer.py`` show up before downstream chunker / multimodal consumers see
- malformed sidecars.
- """
- from __future__ import annotations
- import json
- from pathlib import Path
- import pytest
- from lightrag.sidecar import (
- AssetSpec,
- IRBlock,
- IRDoc,
- IRDrawing,
- IREquation,
- IRPosition,
- IRTable,
- write_sidecar,
- )
def _load_jsonl(path: Path) -> tuple[dict, list[dict]]:
    """Parse a JSONL file into ``(meta, rows)``.

    Every line is decoded as one JSON value. The first value is returned as
    the meta record and all later values, in file order, as the rows. A file
    with no lines yields ``({}, [])``.
    """
    with path.open("r", encoding="utf-8") as handle:
        records = [json.loads(raw) for raw in handle]
    if not records:
        return {}, []
    return records[0], records[1:]
@pytest.mark.offline
def test_writer_empty_doc_emits_only_blocks_jsonl(tmp_path: Path) -> None:
    """A document without blocks yields only ``blocks.jsonl`` with a meta line.

    No per-modality JSON files and no assets directory may appear, and every
    companion-file flag in the meta line must be ``False``.
    """
    out_dir = tmp_path / "empty.parsed"
    doc = IRDoc(
        document_name="empty.docx",
        document_format="docx",
        doc_title="empty",
        split_option={},
        blocks=[],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-0001", engine="native")

    # The output directory must hold exactly one entry (files or directories).
    produced = set()
    for entry in out_dir.iterdir():
        produced.add(entry.name)
    assert produced == {"empty.blocks.jsonl"}

    meta, rows = _load_jsonl(out_dir / "empty.blocks.jsonl")
    assert meta["type"] == "meta"
    assert meta["blocks"] == 0
    for flag in ("asset_dir", "table_file", "drawing_file", "equation_file"):
        assert meta[flag] is False
    assert rows == []
@pytest.mark.offline
def test_writer_renders_table_with_inline_body(tmp_path: Path) -> None:
    """Spec §3.3 / fix 1: tables are rendered inline as a ``<table>`` element.

    The element carries ``id`` and ``format="json"`` attributes and wraps the
    rows as JSON; a ``<cite type="table">`` placeholder must NOT be used. The
    JSON rows therefore appear in the blocks.jsonl content, where doc_hash
    and the F/R/V chunkers can see them.
    """
    out_dir = tmp_path / "t.parsed"
    table = IRTable(
        placeholder_key="t1",
        rows=[["a", "b"], ["1", "2"]],
        num_rows=2,
        num_cols=2,
        caption="cap",
    )
    block = IRBlock(content_template="prefix {{TBL:t1}} suffix", tables=[table])
    doc = IRDoc(
        document_name="t.pdf",
        document_format="pdf",
        doc_title="t",
        split_option={},
        blocks=[block],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-cafebabe", engine="mineru")

    _, rows = _load_jsonl(out_dir / "t.blocks.jsonl")
    assert len(rows) == 1
    content = rows[0]["content"]
    assert '<table id="tb-cafebabe-0001" format="json">' in content
    assert '[["a", "b"], ["1", "2"]]' in content
    assert "</table>" in content
    # Negative check: no citation-style placeholder anywhere in the content.
    assert "<cite" not in content
@pytest.mark.offline
def test_writer_drawing_path_points_into_assets_dir(tmp_path: Path) -> None:
    """Spec §4 / fix 5: a drawing path always points inside ``*.blocks.assets/``.

    The asset must be materialized on disk and ``meta.asset_dir`` must
    reflect it. The same relative path has to show up both in the block
    content and in the matching ``drawings.json`` entry.
    """
    parsed = tmp_path / "d.parsed"
    drawing = IRDrawing(
        placeholder_key="i1",
        asset_ref="img1",
        fmt="png",
        caption="figure 1",
    )
    ir = IRDoc(
        document_name="d.pdf",
        document_format="pdf",
        doc_title="d",
        split_option={},
        blocks=[IRBlock(content_template="see {{IMG:i1}}", drawings=[drawing])],
        assets=[AssetSpec(ref="img1", suggested_name="x.png", source=b"\x89PNG")],
    )

    write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru")

    meta, rows = _load_jsonl(parsed / "d.blocks.jsonl")
    assert meta["asset_dir"] is True
    assert meta["drawing_file"] is True

    body = rows[0]["content"]
    assert 'path="d.blocks.assets/x.png"' in body
    # The asset bytes must be written unchanged under the assets directory.
    assert (parsed / "d.blocks.assets" / "x.png").read_bytes() == b"\x89PNG"

    # Decode explicitly as UTF-8 so the test does not depend on the
    # platform's locale encoding.
    drawings_text = (parsed / "d.drawings.json").read_text(encoding="utf-8")
    drawings = json.loads(drawings_text)["drawings"]
    item = drawings["im-cafebabe-0001"]
    assert item["path"] == "d.blocks.assets/x.png"
    assert item["caption"] == "figure 1"
    assert item["format"] == "png"
@pytest.mark.offline
def test_writer_equation_strips_dollar_wrappers_for_equations_json(
    tmp_path: Path,
) -> None:
    """Dollar wrappers stay in blocks.jsonl but are stripped in equations.json.

    When ``IREquation.latex`` carries MinerU's raw ``$$...$$``/``$..$``
    wrappers (preserved so blocks.jsonl shows the source verbatim), the
    writer must strip them when persisting the equations.json content; that
    file holds clean latex by contract.
    """
    parsed = tmp_path / "d.parsed"
    equation = IREquation(
        placeholder_key="b1",
        latex="$$\nE = mc^2\n$$",
        is_block=True,
    )
    ir = IRDoc(
        document_name="d.pdf",
        document_format="pdf",
        doc_title="d",
        split_option={},
        blocks=[IRBlock(content_template="see {{EQ:b1}}", equations=[equation])],
    )

    write_sidecar(ir, parsed_dir=parsed, doc_id="doc-deadbeef", engine="mineru")

    # blocks.jsonl: the <equation> body preserves the parser's raw form.
    body = _load_jsonl(parsed / "d.blocks.jsonl")[1][0]["content"]
    assert (
        '<equation id="eq-deadbeef-0001" format="latex">$$\nE = mc^2\n$$</equation>'
        in body
    )

    # equations.json: dollar wrappers removed. Decode explicitly as UTF-8 so
    # the test does not depend on the platform's locale encoding.
    equations_text = (parsed / "d.equations.json").read_text(encoding="utf-8")
    equations = json.loads(equations_text)["equations"]
    assert equations["eq-deadbeef-0001"]["content"] == "E = mc^2"
@pytest.mark.offline
def test_writer_equation_caption_preserved_block_and_inline(
    tmp_path: Path,
) -> None:
    """Fix 3 + design decision: ``caption`` is kept on block and inline equations.

    Both forms render as ``<equation caption=...>``. The inline form does NOT
    receive an id and does NOT enter equations.json (spec §6 / §3.3).
    """
    parsed = tmp_path / "e.parsed"
    block_equation = IREquation(
        placeholder_key="b1",
        latex="x^2",
        is_block=True,
        caption="Eq 1",
    )
    inline_equation = IREquation(
        placeholder_key="i1",
        latex="y_n",
        is_block=False,
        caption="Inline",
    )
    ir = IRDoc(
        document_name="e.pdf",
        document_format="pdf",
        doc_title="e",
        split_option={},
        blocks=[
            IRBlock(
                content_template="block {{EQ:b1}} inline {{EQI:i1}}",
                equations=[block_equation, inline_equation],
            )
        ],
    )

    write_sidecar(ir, parsed_dir=parsed, doc_id="doc-cafebabe", engine="mineru")

    body = _load_jsonl(parsed / "e.blocks.jsonl")[1][0]["content"]
    assert (
        '<equation id="eq-cafebabe-0001" format="latex" caption="Eq 1">x^2</equation>'
        in body
    )
    # Inline: no id; caption preserved.
    assert '<equation format="latex" caption="Inline">y_n</equation>' in body

    # Decode explicitly as UTF-8 so the test does not depend on the
    # platform's locale encoding.
    equations_text = (parsed / "e.equations.json").read_text(encoding="utf-8")
    equations = json.loads(equations_text)["equations"]
    # The inline equation must NOT have produced a sidecar entry.
    assert list(equations.keys()) == ["eq-cafebabe-0001"]
    assert equations["eq-cafebabe-0001"]["caption"] == "Eq 1"
@pytest.mark.offline
def test_writer_positions_round_trip_bbox(tmp_path: Path) -> None:
    """Fix 4: block positions are written through unchanged.

    Uses the ``bbox`` position type, which is the one on the mineru path.
    """
    out_dir = tmp_path / "p.parsed"
    bbox = IRPosition(type="bbox", anchor=2, range=[10.0, 20.0, 100.0, 200.0])
    doc = IRDoc(
        document_name="p.pdf",
        document_format="pdf",
        doc_title="p",
        split_option={},
        blocks=[IRBlock(content_template="text", positions=[bbox])],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-aaaa", engine="mineru")

    _, rows = _load_jsonl(out_dir / "p.blocks.jsonl")
    expected = {"type": "bbox", "anchor": 2, "range": [10.0, 20.0, 100.0, 200.0]}
    assert rows[0]["positions"] == [expected]
@pytest.mark.offline
def test_position_origin_to_jsonable_omits_when_none() -> None:
    """Spec §8 per-position origin: an unset origin is left out of the JSON.

    The absent field means the position inherits the meta-level
    ``bbox_attributes.origin``.
    """
    serialized = IRPosition(
        type="bbox", anchor=1, range=[1.0, 2.0, 3.0, 4.0]
    ).to_jsonable()
    assert "origin" not in serialized
@pytest.mark.offline
def test_position_origin_to_jsonable_emits_when_set() -> None:
    """Spec §8 per-position origin: an explicit value is emitted as an override."""
    position = IRPosition(
        type="bbox",
        anchor=1,
        range=[1.0, 2.0, 3.0, 4.0],
        origin="LEFTTOP",
    )
    assert position.to_jsonable()["origin"] == "LEFTTOP"
@pytest.mark.offline
def test_writer_position_origin_mixed_per_block(tmp_path: Path) -> None:
    """Docling mixed ``coord_origin`` scenario.

    The document-level origin lives in meta ``bbox_attributes``; only the
    minority position carries its own ``origin`` override. Coordinates land
    verbatim in both cases.
    """
    out_dir = tmp_path / "mixed.parsed"
    inherited = IRPosition(type="bbox", anchor=1, range=[10.0, 20.0, 30.0, 40.0])
    overridden = IRPosition(
        type="bbox",
        anchor=1,
        range=[50.0, 60.0, 70.0, 80.0],
        origin="LEFTTOP",
    )
    doc = IRDoc(
        document_name="mixed.pdf",
        document_format="pdf",
        doc_title="mixed",
        split_option={},
        blocks=[IRBlock(content_template="text", positions=[inherited, overridden])],
        bbox_attributes={"origin": "LEFTBOTTOM"},
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-bbb1", engine="docling")

    meta, rows = _load_jsonl(out_dir / "mixed.blocks.jsonl")
    assert meta["bbox_attributes"] == {"origin": "LEFTBOTTOM"}

    written = rows[0]["positions"]
    # First position: no "origin" key, so it inherits the meta-level origin.
    expected_inherited = {
        "type": "bbox",
        "anchor": 1,
        "range": [10.0, 20.0, 30.0, 40.0],
    }
    assert written[0] == expected_inherited
    # Second position: carries its explicit override.
    expected_overridden = {
        "type": "bbox",
        "anchor": 1,
        "range": [50.0, 60.0, 70.0, 80.0],
        "origin": "LEFTTOP",
    }
    assert written[1] == expected_overridden
@pytest.mark.offline
def test_writer_drawing_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None:
    """Spec §4 ``self_ref``: omitted when empty, written verbatim otherwise.

    Leaving the field out for drawings that have no ``self_ref`` keeps
    MinerU/native sidecars byte-compatible.
    """
    out_dir = tmp_path / "sref.parsed"
    without_ref = IRDrawing(placeholder_key="a", asset_ref="img_a", fmt="png")
    with_ref = IRDrawing(
        placeholder_key="b",
        asset_ref="img_b",
        fmt="png",
        self_ref="#/pictures/3",
    )
    doc = IRDoc(
        document_name="sref.pdf",
        document_format="pdf",
        doc_title="sref",
        split_option={},
        blocks=[
            IRBlock(
                content_template="{{IMG:a}} {{IMG:b}}",
                drawings=[without_ref, with_ref],
            )
        ],
        assets=[
            AssetSpec(ref="img_a", suggested_name="a.png", source=b"\x89PNG"),
            AssetSpec(ref="img_b", suggested_name="b.png", source=b"\x89PNG"),
        ],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-ccc1", engine="docling")

    payload = json.loads((out_dir / "sref.drawings.json").read_text("utf-8"))
    # Entries are compared positionally, in the order they appear in the file.
    entries = [*payload["drawings"].values()]
    assert "self_ref" not in entries[0]
    assert entries[1]["self_ref"] == "#/pictures/3"
@pytest.mark.offline
def test_writer_table_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None:
    """Spec §5 ``self_ref``: tables omit the field when empty, like drawings."""
    out_dir = tmp_path / "tsref.parsed"
    without_ref = IRTable(placeholder_key="a", rows=[["x"]], num_rows=1, num_cols=1)
    with_ref = IRTable(
        placeholder_key="b",
        rows=[["y"]],
        num_rows=1,
        num_cols=1,
        self_ref="#/tables/0",
    )
    doc = IRDoc(
        document_name="tsref.pdf",
        document_format="pdf",
        doc_title="tsref",
        split_option={},
        blocks=[
            IRBlock(
                content_template="{{TBL:a}} {{TBL:b}}",
                tables=[without_ref, with_ref],
            )
        ],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-ddd1", engine="docling")

    payload = json.loads((out_dir / "tsref.tables.json").read_text("utf-8"))
    # Entries are compared positionally, in the order they appear in the file.
    entries = [*payload["tables"].values()]
    assert "self_ref" not in entries[0]
    assert entries[1]["self_ref"] == "#/tables/0"
@pytest.mark.offline
def test_writer_equation_self_ref_emitted_only_when_nonempty(tmp_path: Path) -> None:
    """Spec §6 ``self_ref``: block equations carry it only when non-empty.

    Inline equations never reach equations.json, so the field is moot for
    them; this test uses block equations only.
    """
    out_dir = tmp_path / "esref.parsed"
    without_ref = IREquation(placeholder_key="a", latex="a+b", is_block=True)
    with_ref = IREquation(
        placeholder_key="b",
        latex="c+d",
        is_block=True,
        self_ref="#/texts/15",
    )
    doc = IRDoc(
        document_name="esref.pdf",
        document_format="pdf",
        doc_title="esref",
        split_option={},
        blocks=[
            IRBlock(
                content_template="{{EQ:a}} {{EQ:b}}",
                equations=[without_ref, with_ref],
            )
        ],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-eee1", engine="docling")

    payload = json.loads((out_dir / "esref.equations.json").read_text("utf-8"))
    # Entries are compared positionally, in the order they appear in the file.
    entries = [*payload["equations"].values()]
    assert "self_ref" not in entries[0]
    assert entries[1]["self_ref"] == "#/texts/15"
@pytest.mark.offline
def test_writer_id_sequence_is_global_per_kind(tmp_path: Path) -> None:
    """IDs increment across blocks within their own kind.

    Tables, drawings and equations each follow an independent sequence: two
    tables spread over two blocks get ``0001`` and ``0002``, while the single
    drawing and the single equation each get ``0001``.
    """
    parsed = tmp_path / "s.parsed"
    blocks = [
        IRBlock(
            content_template="a {{TBL:t}} b {{IMG:i}} c",
            tables=[IRTable(placeholder_key="t", rows=[["x"]], num_rows=1, num_cols=1)],
            drawings=[IRDrawing(placeholder_key="i", asset_ref="a1", fmt="png")],
        ),
        IRBlock(
            content_template="d {{EQ:e}} {{TBL:t}}",
            tables=[IRTable(placeholder_key="t", rows=[["y"]], num_rows=1, num_cols=1)],
            equations=[IREquation(placeholder_key="e", latex="z", is_block=True)],
        ),
    ]
    ir = IRDoc(
        document_name="s.pdf",
        document_format="pdf",
        doc_title="s",
        split_option={},
        blocks=blocks,
        assets=[AssetSpec(ref="a1", suggested_name="img.png", source=b"x")],
    )

    write_sidecar(ir, parsed_dir=parsed, doc_id="doc-bbbb", engine="mineru")

    # All sidecar files are decoded explicitly as UTF-8 so the test does not
    # depend on the platform's locale encoding.
    tables_text = (parsed / "s.tables.json").read_text(encoding="utf-8")
    tables = json.loads(tables_text)["tables"]
    assert sorted(tables.keys()) == ["tb-bbbb-0001", "tb-bbbb-0002"]

    drawings_text = (parsed / "s.drawings.json").read_text(encoding="utf-8")
    drawings = json.loads(drawings_text)["drawings"]
    assert list(drawings.keys()) == ["im-bbbb-0001"]

    equations_text = (parsed / "s.equations.json").read_text(encoding="utf-8")
    equations = json.loads(equations_text)["equations"]
    assert list(equations.keys()) == ["eq-bbbb-0001"]
@pytest.mark.offline
def test_writer_empty_block_dropped(tmp_path: Path) -> None:
    """A block that is empty after stripping leaves no trace in the output.

    An IRBlock whose content strips to empty after placeholder expansion
    produces no blocks.jsonl row AND no sidecar items: the table attached to
    it is dropped along with the block.
    """
    out_dir = tmp_path / "empty_block.parsed"
    orphan_table = IRTable(
        placeholder_key="orphan",
        rows=[["a"]],
        num_rows=1,
        num_cols=1,
    )
    whitespace_block = IRBlock(content_template=" \n ", tables=[orphan_table])
    real_block = IRBlock(content_template="real content")
    doc = IRDoc(
        document_name="x.pdf",
        document_format="pdf",
        doc_title="x",
        split_option={},
        blocks=[whitespace_block, real_block],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-eee", engine="mineru")

    meta, rows = _load_jsonl(out_dir / "x.blocks.jsonl")
    assert meta["blocks"] == 1
    assert len(rows) == 1
    assert rows[0]["content"] == "real content"
    # The orphan table went away with its block, so no tables.json is written.
    tables_file = out_dir / "x.tables.json"
    assert not tables_file.exists()
@pytest.mark.offline
def test_writer_asset_name_collision_suffixed(tmp_path: Path) -> None:
    """Two assets sharing a ``suggested_name``: the second gets a ``-2`` suffix.

    The suffix is applied to the file stem, and the paths rendered into the
    block content reflect the actual on-disk names.
    """
    out_dir = tmp_path / "c.parsed"
    first = IRDrawing(placeholder_key="a", asset_ref="r1", fmt="png")
    second = IRDrawing(placeholder_key="b", asset_ref="r2", fmt="png")
    doc = IRDoc(
        document_name="c.pdf",
        document_format="pdf",
        doc_title="c",
        split_option={},
        blocks=[
            IRBlock(
                content_template="{{IMG:a}} and {{IMG:b}}",
                drawings=[first, second],
            )
        ],
        assets=[
            AssetSpec(ref="r1", suggested_name="img.png", source=b"a"),
            AssetSpec(ref="r2", suggested_name="img.png", source=b"b"),
        ],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-1111", engine="mineru")

    on_disk = [entry.name for entry in (out_dir / "c.blocks.assets").iterdir()]
    on_disk.sort()
    assert on_disk == ["img-2.png", "img.png"]

    _, rows = _load_jsonl(out_dir / "c.blocks.jsonl")
    content = rows[0]["content"]
    assert 'path="c.blocks.assets/img.png"' in content
    assert 'path="c.blocks.assets/img-2.png"' in content
@pytest.mark.offline
def test_writer_meta_has_required_spec_fields(tmp_path: Path) -> None:
    """Spec §3.1: the meta line contains every required field at fixed names.

    Also checks that ``document_hash`` carries a ``sha256:`` prefix and that
    the engine, ``bbox_attributes`` and ``split_option`` values are passed
    through to the meta line.
    """
    out_dir = tmp_path / "m.parsed"
    doc = IRDoc(
        document_name="m.pdf",
        document_format="pdf",
        doc_title="title",
        split_option={"engine_version": "magic-pdf 1.5.4"},
        blocks=[IRBlock(content_template="hello")],
        bbox_attributes={"origin": "LEFTTOP", "max": 1000},
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-deadbeef", engine="mineru")

    meta, _ = _load_jsonl(out_dir / "m.blocks.jsonl")
    required_fields = (
        "type",
        "format",
        "version",
        "document_name",
        "document_format",
        "document_hash",
        "table_file",
        "equation_file",
        "drawing_file",
        "asset_dir",
        "split_option",
        "blocks",
        "doc_id",
        "parse_engine",
        "parse_time",
        "doc_title",
    )
    for field in required_fields:
        assert field in meta, f"meta missing field: {field}"

    assert meta["document_hash"].startswith("sha256:")
    assert meta["parse_engine"] == "mineru"
    assert meta["bbox_attributes"] == {"origin": "LEFTTOP", "max": 1000}
    assert meta["split_option"] == {"engine_version": "magic-pdf 1.5.4"}
@pytest.mark.offline
def test_writer_sidecar_files_only_when_nonempty(tmp_path: Path) -> None:
    """Per-modality JSON files are written only for non-empty maps.

    tables.json / drawings.json / equations.json are NOT written when the
    corresponding maps are empty (spec §1 table). Here only a drawing exists,
    so only drawings.json may appear.
    """
    out_dir = tmp_path / "n.parsed"
    drawing = IRDrawing(placeholder_key="i", asset_ref="r", fmt="png")
    doc = IRDoc(
        document_name="n.docx",
        document_format="docx",
        doc_title="n",
        split_option={},
        blocks=[IRBlock(content_template="{{IMG:i}}", drawings=[drawing])],
        assets=[AssetSpec(ref="r", suggested_name="i.png", source=b"x")],
    )

    write_sidecar(doc, parsed_dir=out_dir, doc_id="doc-aaaa", engine="native")

    # Only regular files count; directories are ignored.
    file_names = set()
    for entry in out_dir.iterdir():
        if entry.is_file():
            file_names.add(entry.name)
    assert "n.drawings.json" in file_names
    assert "n.tables.json" not in file_names
    assert "n.equations.json" not in file_names
@pytest.mark.offline
def test_writer_blockid_formula_stable(tmp_path: Path) -> None:
    """Same content + metadata yields the same blockid.

    The blockid is documented as md5(doc_id:block_index:heading:content).
    This test does not recompute the hash; it only checks that writing the
    same document twice under the same ``doc_id`` gives equal blockids.
    """
    first_dir = tmp_path / "a.parsed"
    second_dir = tmp_path / "b.parsed"
    doc = IRDoc(
        document_name="x.pdf",
        document_format="pdf",
        doc_title="x",
        split_option={},
        blocks=[IRBlock(content_template="abc", heading="H", level=1)],
    )

    write_sidecar(doc, parsed_dir=first_dir, doc_id="doc-fixed", engine="mineru")
    write_sidecar(doc, parsed_dir=second_dir, doc_id="doc-fixed", engine="mineru")

    _, first_rows = _load_jsonl(first_dir / "x.blocks.jsonl")
    _, second_rows = _load_jsonl(second_dir / "x.blocks.jsonl")
    assert first_rows[0]["blockid"] == second_rows[0]["blockid"]
|