ir_builder.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. """Native DOCX IR builder: ``extract_docx_blocks`` output → :class:`IRDoc`.
  2. Input contract: a list of block dicts as produced by
  3. ``lightrag.parser.docx.parse_document.extract_docx_blocks``. Each
  4. block carries ``content`` text in which ``<table>``, ``<equation>`` and
  5. ``<drawing …/>`` placeholders are already embedded by the upstream parser.
  6. The builder rewrites those placeholders into IR placeholder tokens
  7. (``{{TBL:k}} / {{EQ:k}} / {{EQI:k}} / {{IMG:k}}``) and builds the matching
  8. ``IRTable`` / ``IREquation`` / ``IRDrawing`` items.
  9. Asset bytes are extracted to disk by the upstream parser *before* this
  10. builder runs (via ``DrawingExtractionContext`` passed to
  11. ``extract_docx_blocks``). The builder therefore declares assets with
  12. ``AssetSpec.source=None`` — the writer records each entry's size without
  13. copying.
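Block-vs-inline equation distinction follows the legacy native rule: an
``<equation>…</equation>`` tag is *block* iff each side is either the
content boundary or a ``\\n`` character. Anything else is inline. This
builder records both kinds as ``IREquation`` items (with ``is_block`` set
accordingly): block equations are rewritten to ``{{EQ:k}}`` and inline ones
to ``{{EQI:k}}``. Whether inline equations later keep their tag in block
text without an id, or are left out of ``equations.json``, is not decided
in this module.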
  19. Positions are always emitted as ``IRPosition(type="paraid", range=[start,
  20. end])`` where each side may be ``None`` (legacy / non-Word docx authors
  21. sometimes omit ``w14:paraId``). The writer's ``to_jsonable`` faithfully
  22. preserves the per-side null so consumers can distinguish "start missing"
  23. vs "both missing".
  24. """
  25. from __future__ import annotations
  26. import itertools
  27. import json
  28. import re
  29. from collections.abc import Callable
  30. from dataclasses import dataclass, field
  31. from pathlib import Path, PurePosixPath
  32. from typing import Any
  33. from lightrag.parser.docx.drawing_image_extractor import (
  34. DRAWING_TAG_PATTERN,
  35. parse_drawing_attributes,
  36. )
  37. from lightrag.sidecar.ir import (
  38. AssetSpec,
  39. IRBlock,
  40. IRDoc,
  41. IRDrawing,
  42. IREquation,
  43. IRPosition,
  44. IRTable,
  45. )
  46. _TABLE_TAG_RE = re.compile(r"<table>(.*?)</table>", re.DOTALL)
  47. _EQUATION_TAG_RE = re.compile(r"<equation>(.*?)</equation>", re.DOTALL)
  48. def _normalize_dimension(rows_value: Any) -> tuple[int, int]:
  49. if not isinstance(rows_value, list):
  50. return 0, 0
  51. num_rows = len(rows_value)
  52. num_cols = max((len(r) for r in rows_value if isinstance(r, list)), default=0)
  53. return num_rows, num_cols
  54. def _placeholder_keyspace() -> Callable[[str], str]:
  55. """Return a fresh counter producing ``{prefix}{N}`` keys (1-indexed)."""
  56. counter = itertools.count(1)
  57. return lambda prefix: f"{prefix}{next(counter)}"
  58. def _safe_asset_ref_from_path(path_val: str, asset_prefix: str) -> str | None:
  59. """Return the path inside ``asset_prefix`` only when it is safe.
  60. Native DOCX images are pre-extracted into ``<base>.blocks.assets/``.
  61. Treat a drawing path as local only when the suffix is a clean POSIX
  62. relative path. Unsafe local-looking paths are dropped instead of being
  63. registered as assets or preserved as linked references.
  64. """
  65. if not asset_prefix or not path_val.startswith(asset_prefix):
  66. return None
  67. rel_raw = path_val[len(asset_prefix) :]
  68. if not rel_raw or "\\" in rel_raw:
  69. return None
  70. rel_path = PurePosixPath(rel_raw)
  71. if rel_path.is_absolute():
  72. return None
  73. if any(part == ".." for part in rel_path.parts):
  74. return None
  75. rel = rel_path.as_posix()
  76. if rel in {"", "."}:
  77. return None
  78. return rel
  79. @dataclass
  80. class _BlockBuilder:
  81. """Per-block scratch state for the three ``re.sub`` rewrite passes.
  82. Keeping the replacer routines as bound methods (rather than closures
  83. redefined inside the per-block loop) means they're compiled once at
  84. class-load and the state they mutate — ``tables`` / ``drawings`` /
  85. ``equations`` / ``table_position`` — is held explicitly rather than
  86. captured implicitly from the enclosing frame.
  87. """
  88. next_key: Callable[[str], str]
  89. assets: list[AssetSpec]
  90. seen_asset_refs: set[str]
  91. asset_prefix: str
  92. block_table_headers: list[Any]
  93. tables: list[IRTable] = field(default_factory=list)
  94. drawings: list[IRDrawing] = field(default_factory=list)
  95. equations: list[IREquation] = field(default_factory=list)
  96. # Position of the *next* ``<table>`` placeholder within this block,
  97. # used to look up the matching entry in ``block_table_headers``.
  98. table_position: int = 0
  99. def replace_table(self, match: "re.Match[str]") -> str:
  100. table_body_raw = match.group(1)
  101. try:
  102. rows = json.loads(table_body_raw)
  103. if not isinstance(rows, list):
  104. rows = None
  105. except json.JSONDecodeError:
  106. rows = None
  107. if rows is not None:
  108. parsed_rows: list[list[str]] | None = [
  109. [str(c) for c in r] if isinstance(r, list) else [str(r)] for r in rows
  110. ]
  111. html: str | None = None
  112. else:
  113. parsed_rows = None
  114. html = table_body_raw
  115. num_rows, num_cols = _normalize_dimension(parsed_rows)
  116. header_pos = self.table_position
  117. self.table_position += 1
  118. header_rows = (
  119. self.block_table_headers[header_pos]
  120. if header_pos < len(self.block_table_headers)
  121. else None
  122. )
  123. # Treat empty list / explicit None identically: no header
  124. # entry on the sidecar item.
  125. table_header = header_rows if header_rows else None
  126. placeholder = self.next_key("tb")
  127. self.tables.append(
  128. IRTable(
  129. placeholder_key=placeholder,
  130. rows=parsed_rows,
  131. html=html,
  132. num_rows=num_rows,
  133. num_cols=num_cols,
  134. caption="",
  135. footnotes=[],
  136. table_header=table_header,
  137. body_override=table_body_raw,
  138. )
  139. )
  140. return f"{{{{TBL:{placeholder}}}}}"
  141. def replace_equation(self, match: "re.Match[str]") -> str:
  142. latex = match.group(1)
  143. source = match.string
  144. start, end = match.start(), match.end()
  145. is_block = (start == 0 or source[start - 1] == "\n") and (
  146. end == len(source) or source[end] == "\n"
  147. )
  148. placeholder = self.next_key("eq")
  149. self.equations.append(
  150. IREquation(
  151. placeholder_key=placeholder,
  152. latex=latex,
  153. is_block=is_block,
  154. caption="",
  155. footnotes=[],
  156. )
  157. )
  158. token = "EQ" if is_block else "EQI"
  159. return f"{{{{{token}:{placeholder}}}}}"
  160. def replace_drawing(self, match: "re.Match[str]") -> str:
  161. attrs = parse_drawing_attributes(match.group(0))
  162. path_val = attrs.get("path", "") or ""
  163. src_val = attrs.get("src", "") or ""
  164. fmt = attrs.get("format", "") or ""
  165. if not fmt and path_val:
  166. fmt = Path(path_val).suffix.lower().lstrip(".")
  167. # Two flavours of <drawing path="…">:
  168. # 1. Local asset under <base>.blocks.assets/ — already
  169. # extracted to disk by DrawingExtractionContext;
  170. # register as AssetSpec(source=None) and let the
  171. # writer resolve the path via asset_paths.
  172. # 2. External/linked path (URL, or any path that does
  173. # not live under asset_prefix) — pass through
  174. # verbatim via IRDrawing.path_override; do NOT emit
  175. # an AssetSpec (no on-disk bytes to materialize).
  176. rel_inside_assets = _safe_asset_ref_from_path(path_val, self.asset_prefix)
  177. if rel_inside_assets is not None:
  178. asset_ref = rel_inside_assets
  179. suggested_name = Path(rel_inside_assets).name or rel_inside_assets
  180. if asset_ref and asset_ref not in self.seen_asset_refs:
  181. self.assets.append(
  182. AssetSpec(
  183. ref=asset_ref,
  184. suggested_name=suggested_name,
  185. source=None, # already extracted to disk
  186. )
  187. )
  188. self.seen_asset_refs.add(asset_ref)
  189. path_override: str | None = None
  190. else:
  191. asset_ref = ""
  192. # Only mark as an external/linked reference when the
  193. # upstream parser actually emitted a path. An empty
  194. # ``path=""`` should fall back to the regular asset-
  195. # resolution path (which will also produce ``path=""``
  196. # downstream) rather than masquerading as an explicit
  197. # builder override.
  198. path_override = (
  199. None
  200. if self.asset_prefix and path_val.startswith(self.asset_prefix)
  201. else path_val or None
  202. )
  203. placeholder = self.next_key("im")
  204. self.drawings.append(
  205. IRDrawing(
  206. placeholder_key=placeholder,
  207. asset_ref=asset_ref,
  208. fmt=fmt,
  209. caption="",
  210. footnotes=[],
  211. src=src_val,
  212. path_override=path_override,
  213. )
  214. )
  215. return f"{{{{IMG:{placeholder}}}}}"
  216. class NativeDocxIRBuilder:
  217. """Translate ``extract_docx_blocks`` output into an :class:`IRDoc`.
  218. The builder is stateless — instantiate per call. ``asset_dir_name`` is
  219. the relative name (without trailing slash) of ``<base>.blocks.assets/``
  220. that the upstream parser used when emitting ``<drawing path>``
  221. attributes; the builder strips that prefix when building
  222. :attr:`AssetSpec.ref` so the writer's ref↔filename mapping has
  223. predictable keys.
  224. """
  225. def normalize(
  226. self,
  227. blocks: list[dict[str, Any]],
  228. *,
  229. document_name: str,
  230. asset_dir_name: str,
  231. parse_metadata: dict[str, Any] | None = None,
  232. ) -> IRDoc:
  233. next_key = _placeholder_keyspace()
  234. ir_blocks: list[IRBlock] = []
  235. assets: list[AssetSpec] = []
  236. seen_asset_refs: set[str] = set()
  237. asset_prefix = f"{asset_dir_name}/" if asset_dir_name else ""
  238. for block in blocks:
  239. raw_content = block.get("content") or ""
  240. heading = block.get("heading") or ""
  241. level = int(block.get("level", 0) or 0)
  242. parent_headings = list(block.get("parent_headings") or [])
  243. # Preserve per-side nulls in [start, end].
  244. uuid_start = block.get("uuid") or None
  245. uuid_end = block.get("uuid_end") or None
  246. builder = _BlockBuilder(
  247. next_key=next_key,
  248. assets=assets,
  249. seen_asset_refs=seen_asset_refs,
  250. asset_prefix=asset_prefix,
  251. block_table_headers=list(block.get("table_headers") or []),
  252. )
  253. # Rewrite order matches the legacy native flow: tables, then
  254. # equations, then drawings — each ``re.sub`` operates on the
  255. # output of the previous pass.
  256. content_template = _TABLE_TAG_RE.sub(builder.replace_table, raw_content)
  257. content_template = _EQUATION_TAG_RE.sub(
  258. builder.replace_equation, content_template
  259. )
  260. content_template = DRAWING_TAG_PATTERN.sub(
  261. builder.replace_drawing, content_template
  262. )
  263. positions = [
  264. IRPosition(type="paraid", range=[uuid_start, uuid_end]),
  265. ]
  266. ir_blocks.append(
  267. IRBlock(
  268. content_template=content_template,
  269. heading=heading,
  270. level=level,
  271. parent_headings=parent_headings,
  272. positions=positions,
  273. tables=builder.tables,
  274. drawings=builder.drawings,
  275. equations=builder.equations,
  276. )
  277. )
  278. # doc_title: parse_metadata["first_heading"] when present, else file
  279. # stem fallback (resolved here so the writer doesn't have to know).
  280. first_heading = ""
  281. if isinstance(parse_metadata, dict):
  282. first_heading = str(parse_metadata.get("first_heading") or "")
  283. doc_title = first_heading or (Path(document_name).stem or document_name)
  284. return IRDoc(
  285. document_name=document_name,
  286. document_format=Path(document_name).suffix.lower().lstrip("."),
  287. doc_title=doc_title,
  288. split_option={"fixlevel": 0},
  289. blocks=ir_blocks,
  290. assets=assets,
  291. bbox_attributes=None,
  292. )
  293. __all__ = ["NativeDocxIRBuilder"]