"""Intermediate representation (IR) handed by parser adapters to the writer.

Parser engines do not write spec-shaped JSON directly. Each engine adapter
produces an :class:`IRDoc`; :func:`lightrag.sidecar.writer.write_sidecar`
turns that into ``*.parsed/`` files matching ``LightRAGSidecarFormat-zh.md``.

Why an in-process IR (not a serialized intermediate):

- One executable spec point. ``writer.py`` is the only place that knows id
  formats, placeholder tags, blockid computation, ``asset_dir`` truth value.
- Engine adapters only translate; they never embed knowledge of the on-disk
  format.
- The dataclasses below cover the spec contract plus an ``extras`` escape
  hatch on item-level objects so engine-specific signals (rowspan, OCR
  confidence, ...) can be passed through without spec churn.

Placeholder convention used by :attr:`IRBlock.content_template`:

- ``{{TBL:k}}`` — k is the placeholder key declared on the IRTable object
- ``{{IMG:k}}`` — IRDrawing
- ``{{EQ:k}}``  — block-level IREquation (``is_block=True``)
- ``{{EQI:k}}`` — inline IREquation (``is_block=False``); rendered without
  an id, never enters ``equations.json``

The writer expands these templates after id allocation. Adapters MUST emit
exactly one placeholder per item; multiple in-content placeholders sharing
the same key are not supported.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any


@dataclass
class IRPosition:
    """Block-level position. See spec §八.

    ``type`` values: ``"paraid"`` (docx) / ``"bbox"`` (pdf) /
    ``"heading"`` (md) / ``"absolute"`` (text).

    ``origin`` is meaningful only for ``type="bbox"`` and acts as a
    per-position override of ``IRDoc.bbox_attributes.origin`` (spec §八).
    Leave ``None`` to inherit the document-level origin; set explicitly
    (e.g. ``"LEFTTOP"`` / ``"LEFTBOTTOM"``) when this position's coordinate
    system differs from the document default — used by the Docling adapter
    to record mixed ``coord_origin`` without flipping coordinates.
    """

    type: str
    anchor: Any = None
    range: list | None = None
    charspan: list[int] | None = None
    origin: str | None = None

    def to_jsonable(self) -> dict[str, Any]:
        """Return a plain ``dict`` view of this position.

        ``type`` is always present. ``anchor``, ``range``, ``charspan`` and
        ``origin`` are included only when they are not ``None`` (so falsy
        values such as ``0`` or ``[]`` are kept). ``range`` and ``charspan``
        are shallow-copied into new lists, so mutating the returned lists
        does not affect this object.
        """
        out: dict[str, Any] = {"type": self.type}
        if self.anchor is not None:
            out["anchor"] = self.anchor
        if self.range is not None:
            out["range"] = list(self.range)
        if self.charspan is not None:
            out["charspan"] = list(self.charspan)
        if self.origin is not None:
            out["origin"] = self.origin
        return out


@dataclass
class IRTable:
    """Spec §五. ``rows`` (preferred) or ``html`` describes the body.

    The writer replaces ``{{TBL:placeholder_key}}`` in
    ``IRBlock.content_template`` with the table tag wrapping the body;
    ``format`` is chosen by which payload the adapter populated.

    NOTE(review): the literal tag markup that used to be quoted here was
    missing from the text under review — confirm the exact form against
    the writer / spec before quoting it again.
    """

    placeholder_key: str
    rows: list[list[str]] | None = None
    html: str | None = None
    num_rows: int = 0
    num_cols: int = 0
    caption: str = ""
    footnotes: list[str] = field(default_factory=list)
    table_header: list[list[str]] | None = None
    # Spec §五 ``self_ref``: optional pointer into the engine's raw output
    # (e.g. Docling JSON Pointer ``#/tables/2``). Empty string ⇒ writer
    # omits the field. Used for traceability back to ``.docling_raw/``.
    self_ref: str = ""
    extras: dict[str, Any] = field(default_factory=dict)
    # Optional verbatim body to render inside the table tag in
    # ``blocks.jsonl``. When set, the writer uses this string in the block
    # text instead of re-encoding ``rows`` via ``json.dumps`` — preserving
    # the parser's original whitespace/escaping when byte-equivalence with a
    # pre-existing output is required. The ``tables.json`` ``content`` field
    # is unaffected and remains the canonical
    # ``json.dumps(rows, ensure_ascii=False)`` encoding.
    #
    # Coexistence with ``rows`` / ``html``: ``body_override`` does NOT replace
    # the structured body. ``rows`` (or ``html``) must still be populated for
    # the sidecar's ``content`` / ``dimension`` / ``format`` fields and for
    # the writer's ``"json" vs "html"`` format selection. Adapters typically
    # set BOTH (e.g. native docx sets ``rows`` from the parsed JSON AND sets
    # ``body_override`` to the raw verbatim string). When JSON parsing fails
    # in the adapter (``rows`` is None), ``html`` is used as the structured
    # fallback and the writer renders ``format="html"`` with the body_override
    # string verbatim — keeping the original (unparseable) bytes intact.
    body_override: str | None = None


@dataclass
class IRDrawing:
    """Spec §四. ``asset_ref`` points to an :class:`AssetSpec` in IRDoc."""

    placeholder_key: str
    asset_ref: str
    fmt: str = ""
    caption: str = ""
    footnotes: list[str] = field(default_factory=list)
    src: str = ""
    # Spec §四 ``self_ref``: optional pointer into the engine's raw output
    # (e.g. Docling JSON Pointer ``#/pictures/3``). Empty string ⇒ writer
    # omits the field. Used for traceability back to ``.docling_raw/``.
    self_ref: str = ""
    extras: dict[str, Any] = field(default_factory=dict)
    # Optional verbatim path. When set, the writer emits this string as-is
    # in both the drawing's path attribute in ``blocks.jsonl`` and the
    # ``drawings.json`` ``path`` field — bypassing ``asset_paths``
    # resolution and the ``block_drawing_path_style`` transformation. Used
    # for linked / external image references that point at bytes not
    # materialized into ``.blocks.assets/``.
    # NOTE(review): the inline tag/attribute examples originally quoted in
    # this comment were missing from the text under review — restore them
    # from the spec if needed.
    path_override: str | None = None


@dataclass
class IREquation:
    """Spec §六. Block-level or inline equation.

    ``is_block=False`` ⇒ inline; not allocated an id, not written to
    ``equations.json``; rendered inline in the block text (the exact inline
    markup is decided by the writer and is not visible in this module).
    """

    placeholder_key: str
    latex: str
    is_block: bool = True
    caption: str = ""
    footnotes: list[str] = field(default_factory=list)
    # Spec §六 ``self_ref``: optional pointer into the engine's raw output
    # (e.g. Docling JSON Pointer ``#/texts/15``). Empty string ⇒ writer
    # omits the field. Only meaningful when ``is_block=True``; inline
    # equations never enter ``equations.json``.
    self_ref: str = ""
    extras: dict[str, Any] = field(default_factory=dict)


@dataclass
class IRBlock:
    """One content block (spec §3.2).

    ``content_template`` is the final block text with placeholder tokens
    embedded. The writer expands tokens once ids are assigned.
    """

    content_template: str
    heading: str = ""
    level: int = 0
    parent_headings: list[str] = field(default_factory=list)
    session_type: str = "body"
    table_slice: str = "none"
    table_header: str | None = None
    positions: list[IRPosition] = field(default_factory=list)
    tables: list[IRTable] = field(default_factory=list)
    drawings: list[IRDrawing] = field(default_factory=list)
    equations: list[IREquation] = field(default_factory=list)


@dataclass
class AssetSpec:
    """Describes one file that lands in ``.blocks.assets/``.

    ``source`` may be:

    - :class:`pathlib.Path` to an existing file on disk (writer copies it);
    - :class:`bytes` payload (writer dumps it);
    - ``None`` when the file is already in place inside the assets
      directory (e.g. native docx parser writes assets during extraction);
      the writer then records its size without touching it.

    Carrier protocol: a drawing references the asset by :attr:`ref`; the
    writer resolves that to a concrete filename inside the assets dir and
    writes the result to both ``drawings.json`` (full relative path) and the
    drawing's path attribute in ``blocks.jsonl``.
    """

    ref: str
    suggested_name: str
    source: Path | bytes | None = None


@dataclass
class IRDoc:
    """Top-level IR — the input to :func:`write_sidecar`."""

    document_name: str
    document_format: str
    doc_title: str
    split_option: dict[str, Any]
    blocks: list[IRBlock]
    assets: list[AssetSpec] = field(default_factory=list)
    bbox_attributes: dict[str, Any] | None = None