"""Intermediate representation (IR) handed by parser adapters to the writer.

Parser engines do not write spec-shaped JSON directly. Each engine adapter
produces an :class:`IRDoc`; :func:`lightrag.sidecar.writer.write_sidecar`
turns that into ``*.parsed/`` files matching ``LightRAGSidecarFormat-zh.md``.

Why an in-process IR (not a serialized intermediate):

- One executable spec point. ``writer.py`` is the only place that knows id
  formats, placeholder tags, blockid computation, ``asset_dir`` truth value.
- Engine adapters only translate; they never embed knowledge of the on-disk
  format.
- The dataclasses below cover the spec contract plus an ``extras`` escape
  hatch on item-level objects so engine-specific signals (rowspan, OCR
  confidence, ...) can be passed through without spec churn.

Placeholder convention used by :attr:`IRBlock.content_template`:

- ``{{TBL:k}}`` — k is the placeholder key declared on the IRTable object
- ``{{IMG:k}}`` — IRDrawing
- ``{{EQ:k}}`` — block-level IREquation (``is_block=True``)
- ``{{EQI:k}}`` — inline IREquation (``is_block=False``); rendered without an
  id, never enters ``equations.json``

The writer expands these templates after id allocation. Adapters MUST emit
exactly one placeholder per item; multiple in-content placeholders sharing
the same key are not supported.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass
class IRPosition:
    """Location of a block inside its source document (spec §八).

    ``type`` selects the addressing scheme: ``"paraid"`` (docx),
    ``"bbox"`` (pdf), ``"heading"`` (md) or ``"absolute"`` (text).

    ``origin`` only matters when ``type="bbox"``. It overrides
    ``IRDoc.bbox_attributes.origin`` for this single position (spec §八).
    ``None`` means "inherit the document-level origin"; an explicit value
    such as ``"LEFTTOP"`` / ``"LEFTBOTTOM"`` marks a position whose
    coordinate system differs from the document default. The Docling
    adapter uses this to record mixed ``coord_origin`` values without
    flipping coordinates.
    """

    type: str
    anchor: Any = None
    range: list | None = None
    charspan: list[int] | None = None
    origin: str | None = None

    def to_jsonable(self) -> dict[str, Any]:
        """Return a plain ``dict`` holding ``type`` plus every non-``None`` field.

        Keys are inserted in declaration order (``type``, ``anchor``,
        ``range``, ``charspan``, ``origin``). ``range`` and ``charspan`` are
        shallow-copied into fresh lists so the result does not alias the
        instance's own sequences; ``anchor`` and ``origin`` are passed
        through as-is.
        """
        payload: dict[str, Any] = {"type": self.type}
        # (output key, current value, whether to copy into a new list)
        optional_fields = (
            ("anchor", self.anchor, False),
            ("range", self.range, True),
            ("charspan", self.charspan, True),
            ("origin", self.origin, False),
        )
        for key, value, copy_as_list in optional_fields:
            if value is None:
                continue
            payload[key] = list(value) if copy_as_list else value
        return payload
@dataclass
class IRTable:
    """A table attached to a block (spec §五).

    The body is carried by ``rows`` (preferred) or ``html``. The writer
    replaces the ``{{TBL:placeholder_key}}`` token found in
    ``IRBlock.content_template`` with the table's markup; the ``format``
    recorded for it follows whichever payload the adapter populated.
    """

    placeholder_key: str
    rows: list[list[str]] | None = None
    html: str | None = None
    num_rows: int = 0
    num_cols: int = 0
    caption: str = ""
    footnotes: list[str] = field(default_factory=list)
    table_header: list[list[str]] | None = None
    # ``self_ref`` (spec §五) optionally points back into the engine's raw
    # output, e.g. a Docling JSON Pointer such as ``#/tables/2``. It exists
    # for traceability to ``.docling_raw/``; an empty string makes the
    # writer leave the field out.
    self_ref: str = ""
    # Pass-through bag for engine-specific signals.
    extras: dict[str, Any] = field(default_factory=dict)
    # ``body_override``: optional verbatim text placed inside the table
    # markup in ``blocks.jsonl``. When present, the writer emits this string
    # in the block text rather than re-encoding ``rows`` with
    # ``json.dumps``, so the parser's original whitespace/escaping survives
    # when byte-equivalence with an existing output is required. The
    # ``content`` field of ``tables.json`` is not affected: it stays the
    # canonical ``json.dumps(rows, ensure_ascii=False)`` encoding.
    #
    # Relationship to ``rows`` / ``html``: the override is an addition, not
    # a substitute. ``rows`` (or ``html``) still has to be filled in because
    # the sidecar's ``content`` / ``dimension`` / ``format`` fields and the
    # writer's ``"json"`` vs ``"html"`` choice depend on it. Adapters
    # usually set both — native docx, for example, fills ``rows`` from the
    # parsed JSON and ``body_override`` from the raw string. If the adapter
    # could not parse the JSON (``rows`` is None), ``html`` serves as the
    # structured fallback and the writer emits ``format="html"`` with the
    # override string verbatim, leaving the unparseable bytes untouched.
    body_override: str | None = None
@dataclass
class IRDrawing:
    """An image/drawing attached to a block (spec §四).

    ``asset_ref`` names the :class:`AssetSpec` in the owning IRDoc that
    supplies the drawing's file.
    """

    placeholder_key: str
    asset_ref: str
    fmt: str = ""
    caption: str = ""
    footnotes: list[str] = field(default_factory=list)
    src: str = ""
    # ``self_ref`` (spec §四) optionally points back into the engine's raw
    # output, e.g. a Docling JSON Pointer such as ``#/pictures/3``. It
    # exists for traceability to ``.docling_raw/``; an empty string makes
    # the writer leave the field out.
    self_ref: str = ""
    # Pass-through bag for engine-specific signals.
    extras: dict[str, Any] = field(default_factory=dict)
    # ``path_override``: optional verbatim path. When present, the writer
    # emits this exact string both in the drawing markup inside
    # ``blocks.jsonl`` and in the ``path`` field of ``drawings.json``,
    # skipping ``asset_paths`` resolution and the
    # ``block_drawing_path_style`` transformation. Intended for linked /
    # external image references whose bytes are not materialized into
    # ``.blocks.assets/``.
    path_override: str | None = None
@dataclass
class IREquation:
    """An equation attached to a block (spec §六).

    With ``is_block=False`` the equation is inline: it gets no id, is not
    written to ``equations.json``, and is rendered directly within the
    block text.
    """

    placeholder_key: str
    latex: str
    is_block: bool = True
    caption: str = ""
    footnotes: list[str] = field(default_factory=list)
    # ``self_ref`` (spec §六) optionally points back into the engine's raw
    # output, e.g. a Docling JSON Pointer such as ``#/texts/15``. An empty
    # string makes the writer leave the field out. It only has meaning for
    # ``is_block=True``, because inline equations never reach
    # ``equations.json``.
    self_ref: str = ""
    # Pass-through bag for engine-specific signals.
    extras: dict[str, Any] = field(default_factory=dict)
@dataclass
class IRBlock:
    """A single content block (spec §3.2).

    ``content_template`` holds the block's final text with placeholder
    tokens still embedded; the writer substitutes those tokens after ids
    have been assigned. The ``tables`` / ``drawings`` / ``equations`` lists
    carry the item objects attached to this block.
    """

    content_template: str
    heading: str = ""
    level: int = 0
    parent_headings: list[str] = field(default_factory=list)
    session_type: str = "body"
    table_slice: str = "none"
    table_header: str | None = None
    positions: list[IRPosition] = field(default_factory=list)
    tables: list[IRTable] = field(default_factory=list)
    drawings: list[IRDrawing] = field(default_factory=list)
    equations: list[IREquation] = field(default_factory=list)
@dataclass
class AssetSpec:
    """One file destined for ``.blocks.assets/``.

    ``source`` takes one of three forms:

    - a :class:`pathlib.Path` to an existing file on disk, which the writer
      copies;
    - a :class:`bytes` payload, which the writer dumps;
    - ``None`` when the file is already at its final location (e.g. the
      native docx parser writes assets during extraction); the writer then
      only records its size and leaves the file alone.
      NOTE(review): the final location is presumably the assets dir plus
      ``suggested_name`` — confirm against the writer.

    Carrier protocol: a drawing refers to the asset through :attr:`ref`.
    The writer resolves that to a concrete filename inside the assets dir
    and writes the result to ``drawings.json`` (full relative path) as well
    as to the drawing markup in ``blocks.jsonl``.
    """

    ref: str
    suggested_name: str
    source: Path | bytes | None = None
@dataclass
class IRDoc:
    """Root of the IR; the object passed to :func:`write_sidecar`.

    Bundles the document-level metadata with the ordered ``blocks`` and the
    ``assets`` that drawings reference. ``bbox_attributes`` is optional and
    defaults to ``None``.
    """

    document_name: str
    document_format: str
    doc_title: str
    split_option: dict[str, Any]
    blocks: list[IRBlock]
    assets: list[AssetSpec] = field(default_factory=list)
    bbox_attributes: dict[str, Any] | None = None