chunk_schema.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. """Chunk schema helpers shared across the chunking + extraction pipeline.
  2. Three responsibilities live here so chunker implementations and the pipeline
  3. both consume identical normalization rules:
  4. - :func:`normalize_chunk_heading` collapses the legacy flat
  5. ``heading``/``parent_headings``/``level`` triple and the new nested form
  6. into the canonical ``{"level", "heading", "parent_headings"}`` dict.
  7. - :func:`normalize_chunk_sidecar` validates the new ``sidecar`` payload and
  8. ensures ``refs`` is always present as a list (single-source items may omit
  9. it before normalization; we materialize a single-element list for the
  10. storage layer).
  11. - :func:`strip_internal_multimodal_markup_for_extraction` rewrites
  12. ``<cite>`` / ``<drawing>`` / ``<equation>`` markup so the entity-extraction
  13. LLM sees a clean text body. The original ``chunk["content"]`` is never
  14. mutated; the cleaned string is only used to build the extraction prompt.
  15. The clean function is intentionally conservative: tags are rebuilt from a
  16. small allow-list (``caption``, plus ``format`` on tables and equations), so
  17. parser-emitted attributes such as ``id``, ``refid``, ``path`` and ``src`` never
  18. reach the LLM. Visible captions and table / equation bodies are preserved so
  18. the extracted entities can still ground against them.
  19. """
  20. from __future__ import annotations
  21. import re
  22. from typing import Any
  23. _SIDECAR_TYPES = frozenset({"block", "drawing", "table", "equation"})
  24. def normalize_chunk_heading(dp: dict[str, Any]) -> dict[str, Any] | None:
  25. """Return the canonical nested heading dict or ``None`` when absent.
  26. Accepts:
  27. - ``dp["heading"]`` already a dict ``{"level", "heading", "parent_headings"}``.
  28. - Legacy flat fields ``heading: str`` + ``parent_headings: list[str]`` +
  29. ``level: int``.
  30. Empty / missing inputs collapse to ``None`` so callers can simply omit
  31. the field when writing the chunk record.
  32. """
  33. nested = dp.get("heading")
  34. if isinstance(nested, dict):
  35. heading_text = str(nested.get("heading") or "").strip()
  36. parents_raw = nested.get("parent_headings") or []
  37. level_raw = nested.get("level", 0)
  38. else:
  39. heading_text = str(nested or "").strip()
  40. parents_raw = dp.get("parent_headings") or []
  41. level_raw = dp.get("level", 0)
  42. parent_headings: list[str] = []
  43. if isinstance(parents_raw, list):
  44. for entry in parents_raw:
  45. text = str(entry or "").strip()
  46. if text:
  47. parent_headings.append(text)
  48. try:
  49. level = int(level_raw or 0)
  50. except (TypeError, ValueError):
  51. level = 0
  52. if not heading_text and not parent_headings and level == 0:
  53. return None
  54. return {
  55. "level": level,
  56. "heading": heading_text,
  57. "parent_headings": parent_headings,
  58. }
  59. def normalize_chunk_sidecar(dp: dict[str, Any]) -> dict[str, Any] | None:
  60. """Return the canonical sidecar dict or ``None`` when absent / invalid.
  61. Output shape::
  62. {"type": <one of block|drawing|table|equation>,
  63. "id": <primary source id>,
  64. "refs": [{"type": ..., "id": ...}, ...]}
  65. ``refs`` is always materialized as a list with at least the primary id.
  66. Single-source chunks therefore land in storage with ``refs=[{type,id}]``
  67. so downstream consumers don't need to special-case the field's presence.
  68. """
  69. sidecar = dp.get("sidecar")
  70. if not isinstance(sidecar, dict):
  71. return None
  72. sidecar_type = str(sidecar.get("type") or "").strip()
  73. sidecar_id = str(sidecar.get("id") or "").strip()
  74. if sidecar_type not in _SIDECAR_TYPES or not sidecar_id:
  75. return None
  76. refs_raw = sidecar.get("refs")
  77. refs: list[dict[str, str]] = []
  78. if isinstance(refs_raw, list):
  79. for entry in refs_raw:
  80. if not isinstance(entry, dict):
  81. continue
  82. ref_type = str(entry.get("type") or "").strip()
  83. ref_id = str(entry.get("id") or "").strip()
  84. if ref_type in _SIDECAR_TYPES and ref_id:
  85. refs.append({"type": ref_type, "id": ref_id})
  86. if not refs:
  87. refs = [{"type": sidecar_type, "id": sidecar_id}]
  88. return {"type": sidecar_type, "id": sidecar_id, "refs": refs}
  89. # `<cite type="..." refid="...">visible text</cite>` → `visible text`.
  90. _CITE_RE = re.compile(
  91. r"<cite\b[^>]*>(.*?)</cite>",
  92. flags=re.IGNORECASE | re.DOTALL,
  93. )
  94. # Inner attribute stripper used when the caller wants to *preserve* the
  95. # `<cite type="…">…</cite>` wrapper but drop the parser-internal `refid`.
  96. # Matches ` refid="…"` (leading whitespace + quoted value) so the
  97. # surrounding attribute layout (e.g. `type="table"`) stays intact.
  98. _CITE_REFID_ATTR_RE = re.compile(
  99. r'\s+refid\s*=\s*"[^"]*"',
  100. flags=re.IGNORECASE,
  101. )
  102. # Self-closing `<drawing ...>` placeholder. We keep `caption` (visible) and
  103. # drop `id`, `path`, `src`, `format`, etc. Tags without any caption are
  104. # removed entirely so they don't pollute extraction input.
  105. _DRAWING_RE = re.compile(
  106. r"<drawing\b([^>]*)/>",
  107. flags=re.IGNORECASE,
  108. )
  109. # Container `<equation id="..." format="...">latex</equation>`. Strip
  110. # identifier attributes; preserve the body and the `format` attribute so
  111. # extraction still sees the equation is a structured element.
  112. _EQUATION_RE = re.compile(
  113. r"<equation\b([^>]*)>(.*?)</equation>",
  114. flags=re.IGNORECASE | re.DOTALL,
  115. )
  116. # Container `<table id="tb-..." format="json" caption="...">rows</table>`.
  117. # Native parser emits the internal ``tb-<doc>-NNNN`` identifier here, which
  118. # would otherwise leak into the entity-extraction prompt and become a noisy
  119. # entity. Strip ``id``; keep ``format`` / ``caption`` (and the body verbatim)
  120. # so the extractor still recognizes the element as a structured table.
  121. _TABLE_RE = re.compile(
  122. r"<table\b([^>]*)>(.*?)</table>",
  123. flags=re.IGNORECASE | re.DOTALL,
  124. )
  125. # Match attribute pairs like ``caption="text with \"escapes\""``. We treat
  126. # only the safe identifier-style attributes; complex quoting is rare in
  127. # parser output.
  128. _ATTR_RE = re.compile(
  129. r'(\w+)\s*=\s*"((?:[^"\\]|\\.)*)"',
  130. )
  131. def _attrs_to_dict(attr_string: str) -> dict[str, str]:
  132. return {
  133. match.group(1).lower(): match.group(2)
  134. for match in _ATTR_RE.finditer(attr_string)
  135. }
  136. def _format_attrs(pairs: list[tuple[str, str]]) -> str:
  137. return "".join(f' {k}="{v}"' for k, v in pairs if v)
  138. def _replace_drawing(match: re.Match[str]) -> str:
  139. attrs = _attrs_to_dict(match.group(1))
  140. caption = attrs.get("caption", "")
  141. if not caption.strip():
  142. return ""
  143. return f"<drawing{_format_attrs([('caption', caption)])} />"
  144. def _replace_equation(match: re.Match[str]) -> str:
  145. attrs = _attrs_to_dict(match.group(1))
  146. body = match.group(2)
  147. keep: list[tuple[str, str]] = []
  148. fmt = attrs.get("format", "")
  149. if fmt:
  150. keep.append(("format", fmt))
  151. caption = attrs.get("caption", "")
  152. if caption.strip():
  153. keep.append(("caption", caption))
  154. return f"<equation{_format_attrs(keep)}>{body}</equation>"
  155. def _replace_table(match: re.Match[str]) -> str:
  156. attrs = _attrs_to_dict(match.group(1))
  157. body = match.group(2)
  158. keep: list[tuple[str, str]] = []
  159. fmt = attrs.get("format", "")
  160. if fmt:
  161. keep.append(("format", fmt))
  162. caption = attrs.get("caption", "")
  163. if caption.strip():
  164. keep.append(("caption", caption))
  165. return f"<table{_format_attrs(keep)}>{body}</table>"
  166. def strip_internal_multimodal_markup_for_extraction(
  167. content: str, *, keep_cite_tag: bool = False
  168. ) -> str:
  169. """Strip parser-internal identifiers from a chunk content string.
  170. Only the entity-extraction prompt should receive the cleaned form;
  171. callers must NOT mutate the stored chunk ``content`` so query-time
  172. citations still resolve back to the original parser output.
  173. Transformations always applied:
  174. - ``<drawing id="im-…" path="…" src="…" caption="Fig 1" />``
  175. → ``<drawing caption="Fig 1" />``
  176. (drops the entire tag when no caption is present)
  177. - ``<table id="tb-…" format="json" caption="…">rows</table>``
  178. → ``<table format="json" caption="…">rows</table>``
  179. - ``<equation id="eq-…" format="latex">…</equation>``
  180. → ``<equation format="latex">…</equation>``
  181. Cite-tag handling depends on ``keep_cite_tag``:
  182. - ``keep_cite_tag=False`` (default — entity-extraction path):
  183. ``<cite type="…" refid="…">Table 1</cite>`` → ``Table 1``. The
  184. cite wrapper is dropped so the extractor does not surface it as
  185. a noisy structural entity.
  186. - ``keep_cite_tag=True`` (multimodal-analysis surrounding path):
  187. ``<cite type="table" refid="…">Table 1</cite>`` →
  188. ``<cite type="table">Table 1</cite>``. Only the internal
  189. ``refid`` is removed; the wrapper survives so the VLM/LLM can
  190. tell visible reference labels (e.g. "Table 1") apart from inline
  191. prose.
  192. """
  193. if not content:
  194. return content
  195. if keep_cite_tag:
  196. cleaned = _CITE_REFID_ATTR_RE.sub("", content)
  197. else:
  198. cleaned = _CITE_RE.sub(lambda m: m.group(1), content)
  199. cleaned = _DRAWING_RE.sub(_replace_drawing, cleaned)
  200. cleaned = _TABLE_RE.sub(_replace_table, cleaned)
  201. cleaned = _EQUATION_RE.sub(_replace_equation, cleaned)
  202. return cleaned
  203. __all__ = [
  204. "normalize_chunk_heading",
  205. "normalize_chunk_sidecar",
  206. "strip_internal_multimodal_markup_for_extraction",
  207. ]