ir.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
"""Intermediate representation (IR) handed by parser adapters to the writer.

Parser engines do not write spec-shaped JSON directly. Each engine adapter
produces an :class:`IRDoc`; :func:`lightrag.sidecar.writer.write_sidecar`
turns that into ``*.parsed/`` files matching ``LightRAGSidecarFormat-zh.md``.

Why an in-process IR (not a serialized intermediate):

- One executable spec point. ``writer.py`` is the only place that knows id
  formats, placeholder tags, blockid computation, ``asset_dir`` truth value.
- Engine adapters only translate; they never embed knowledge of the on-disk
  format.
- The dataclasses below cover the spec contract plus an ``extras`` escape
  hatch on item-level objects so engine-specific signals (rowspan, OCR
  confidence, ...) can be passed through without spec churn.

Placeholder convention used by :attr:`IRBlock.content_template`:

- ``{{TBL:k}}`` — k is the placeholder key declared on the IRTable object
- ``{{IMG:k}}`` — IRDrawing
- ``{{EQ:k}}`` — block-level IREquation (``is_block=True``)
- ``{{EQI:k}}`` — inline IREquation (``is_block=False``); rendered without an
  id, never enters ``equations.json``

The writer expands these templates after id allocation. Adapters MUST emit
exactly one placeholder per item; multiple in-content placeholders sharing
the same key are not supported.
"""
  23. from __future__ import annotations
  24. from dataclasses import dataclass, field
  25. from pathlib import Path
  26. from typing import Any
  27. @dataclass
  28. class IRPosition:
  29. """Block-level position. See spec §八.
  30. ``type`` values: ``"paraid"`` (docx) / ``"bbox"`` (pdf) /
  31. ``"heading"`` (md) / ``"absolute"`` (text).
  32. ``origin`` is meaningful only for ``type="bbox"`` and acts as a
  33. per-position override of ``IRDoc.bbox_attributes.origin`` (spec §八).
  34. Leave ``None`` to inherit the document-level origin; set explicitly
  35. (e.g. ``"LEFTTOP"`` / ``"LEFTBOTTOM"``) when this position's
  36. coordinate system differs from the document default — used by the
  37. Docling adapter to record mixed ``coord_origin`` without flipping
  38. coordinates.
  39. """
  40. type: str
  41. anchor: Any = None
  42. range: list | None = None
  43. charspan: list[int] | None = None
  44. origin: str | None = None
  45. def to_jsonable(self) -> dict[str, Any]:
  46. out: dict[str, Any] = {"type": self.type}
  47. if self.anchor is not None:
  48. out["anchor"] = self.anchor
  49. if self.range is not None:
  50. out["range"] = list(self.range)
  51. if self.charspan is not None:
  52. out["charspan"] = list(self.charspan)
  53. if self.origin is not None:
  54. out["origin"] = self.origin
  55. return out
  56. @dataclass
  57. class IRTable:
  58. """Spec §五. ``rows`` (preferred) or ``html`` describes the body.
  59. The writer renders ``{{TBL:placeholder_key}}`` in IRBlock.content_template
  60. as ``<table id="tb-..." format="json|html">body</table>``; ``format``
  61. is chosen by which payload the adapter populated.
  62. """
  63. placeholder_key: str
  64. rows: list[list[str]] | None = None
  65. html: str | None = None
  66. num_rows: int = 0
  67. num_cols: int = 0
  68. caption: str = ""
  69. footnotes: list[str] = field(default_factory=list)
  70. table_header: list[list[str]] | None = None
  71. # Spec §五 ``self_ref``: optional pointer into the engine's raw output
  72. # (e.g. Docling JSON Pointer ``#/tables/2``). Empty string ⇒ writer
  73. # omits the field. Used for traceability back to ``.docling_raw/``.
  74. self_ref: str = ""
  75. extras: dict[str, Any] = field(default_factory=dict)
  76. # Optional verbatim body to render inside the ``<table …>…</table>`` tag
  77. # in ``blocks.jsonl``. When set, the writer uses this string in the block
  78. # text instead of re-encoding ``rows`` via ``json.dumps`` — preserving
  79. # the parser's original whitespace/escaping when byte-equivalence with a
  80. # pre-existing output is required. The ``tables.json`` ``content`` field
  81. # is unaffected and remains the canonical
  82. # ``json.dumps(rows, ensure_ascii=False)`` encoding.
  83. #
  84. # Coexistence with ``rows`` / ``html``: ``body_override`` does NOT replace
  85. # the structured body. ``rows`` (or ``html``) must still be populated for
  86. # the sidecar's ``content`` / ``dimension`` / ``format`` fields and for
  87. # the writer's ``"json" vs "html"`` format selection. Adapters typically
  88. # set BOTH (e.g. native docx sets ``rows`` from the parsed JSON AND sets
  89. # ``body_override`` to the raw verbatim string). When JSON parsing fails
  90. # in the adapter (``rows`` is None), ``html`` is used as the structured
  91. # fallback and the writer renders ``format="html"`` with the body_override
  92. # string verbatim — keeping the original (unparseable) bytes intact.
  93. body_override: str | None = None
  94. @dataclass
  95. class IRDrawing:
  96. """Spec §四. ``asset_ref`` points to an :class:`AssetSpec` in IRDoc."""
  97. placeholder_key: str
  98. asset_ref: str
  99. fmt: str = ""
  100. caption: str = ""
  101. footnotes: list[str] = field(default_factory=list)
  102. src: str = ""
  103. # Spec §四 ``self_ref``: optional pointer into the engine's raw output
  104. # (e.g. Docling JSON Pointer ``#/pictures/3``). Empty string ⇒ writer
  105. # omits the field. Used for traceability back to ``.docling_raw/``.
  106. self_ref: str = ""
  107. extras: dict[str, Any] = field(default_factory=dict)
  108. # Optional verbatim path. When set, the writer emits this string in
  109. # both the ``blocks.jsonl`` ``<drawing path>`` attribute and the
  110. # ``drawings.json`` ``path`` field as-is — bypassing
  111. # ``asset_paths`` resolution and the ``block_drawing_path_style``
  112. # transformation. Used for linked / external image references (e.g.
  113. # ``<drawing path="https://…/img.png" />``) that point at bytes not
  114. # materialized into ``<base>.blocks.assets/``.
  115. path_override: str | None = None
  116. @dataclass
  117. class IREquation:
  118. """Spec §六. ``is_block=False`` ⇒ inline; not allocated an id, not written
  119. to ``equations.json``; rendered as ``<equation format="latex">…</equation>``
  120. in block text.
  121. """
  122. placeholder_key: str
  123. latex: str
  124. is_block: bool = True
  125. caption: str = ""
  126. footnotes: list[str] = field(default_factory=list)
  127. # Spec §六 ``self_ref``: optional pointer into the engine's raw output
  128. # (e.g. Docling JSON Pointer ``#/texts/15``). Empty string ⇒ writer
  129. # omits the field. Only meaningful when ``is_block=True``; inline
  130. # equations never enter ``equations.json``.
  131. self_ref: str = ""
  132. extras: dict[str, Any] = field(default_factory=dict)
  133. @dataclass
  134. class IRBlock:
  135. """One content block (spec §3.2).
  136. ``content_template`` is the final block text with placeholder tokens
  137. embedded. The writer expands tokens once ids are assigned.
  138. """
  139. content_template: str
  140. heading: str = ""
  141. level: int = 0
  142. parent_headings: list[str] = field(default_factory=list)
  143. session_type: str = "body"
  144. table_slice: str = "none"
  145. table_header: str | None = None
  146. positions: list[IRPosition] = field(default_factory=list)
  147. tables: list[IRTable] = field(default_factory=list)
  148. drawings: list[IRDrawing] = field(default_factory=list)
  149. equations: list[IREquation] = field(default_factory=list)
  150. @dataclass
  151. class AssetSpec:
  152. """Describes one file that lands in ``<base>.blocks.assets/``.
  153. ``source`` may be:
  154. - :class:`pathlib.Path` to an existing file on disk (writer copies it);
  155. - :class:`bytes` payload (writer dumps it);
  156. - ``None`` when the file is already in place at ``<assets_dir>/<suggested_name>``
  157. (e.g. native docx parser writes assets during extraction); the writer
  158. then records its size without touching it.
  159. Carrier protocol: a drawing references the asset by :attr:`ref`; the
  160. writer resolves that to a concrete filename inside the assets dir and
  161. writes the result to both ``drawings.json`` (full relative path) and
  162. the ``<drawing path>`` attribute in ``blocks.jsonl``.
  163. """
  164. ref: str
  165. suggested_name: str
  166. source: Path | bytes | None = None
  167. @dataclass
  168. class IRDoc:
  169. """Top-level IR — the input to :func:`write_sidecar`."""
  170. document_name: str
  171. document_format: str
  172. doc_title: str
  173. split_option: dict[str, Any]
  174. blocks: list[IRBlock]
  175. assets: list[AssetSpec] = field(default_factory=list)
  176. bbox_attributes: dict[str, Any] | None = None