multimodal_context.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028
  1. """Surrounding-context enrichment for native multimodal sidecars.
  2. See ``docs/NativeMultimodalSurroundingContextPlan-zh.md``.
  3. For each entry in ``drawings.json`` / ``tables.json`` / ``equations.json``,
  4. this module locates the matching ``<drawing … id="…" … />``,
  5. ``<table … id="…" …>…</table>`` / table ``<cite refid="…">`` or
  6. ``<equation … id="…" …>…</equation>`` inside the *single*
  7. ``blocks.jsonl`` content row referenced by the entry's ``blockid``, then
  8. extracts up to ``max_tokens`` of leading and trailing text from the same
  9. row (without crossing block rows).
  10. Sidecar entries gain an optional ``surrounding`` field:
  11. {
  12. "leading": "…",
  13. "trailing": "…"
  14. }
  15. with both halves capped at ``max_tokens`` tokens (default 2000).
  16. Truncation prefers paragraph / sentence / clause boundaries (using the
  17. recursive separator cascade from ``CHUNK_R_SEPARATORS`` / falling back
  18. to :data:`lightrag.constants.DEFAULT_R_SEPARATORS`); only when a single
  19. closest segment alone exceeds the budget does the splitter fall through
  20. to a character-level binary search.
  21. Multimodal tags (``<drawing/>``, ``<equation>…</equation>``,
  22. ``<table>…</table>``) inside the candidate text are treated as atomic so
  23. the splitter cannot cut a tag in half. For ``tables.json`` entries —
  24. where the surrounding should describe text around the target table
  25. without dragging other tables along — every ``<table>…</table>`` is
  26. removed from the candidate text *before* token counting and
  27. segmentation, so the saved surrounding string and the tokens budgeted
  28. against it stay in sync. For ``drawings.json`` / ``equations.json``
  29. entries the table tags are preserved when they fit; oversized JSON or
  30. HTML tables are row-trimmed (tail rows for leading, head rows for
  31. trailing) so the surrounding keeps the rows physically closest to the
  32. target.
  33. Parser-internal identifiers (``id`` / ``path`` / ``src`` / ``refid``) are
  34. stripped from the candidate text via
  35. :func:`lightrag.chunk_schema.strip_internal_multimodal_markup_for_extraction`
  36. **before** atomization and token-budgeted truncation. This mirrors the
  37. treatment given to chunk content prior to entity extraction (see
  38. ``lightrag.operate._process_single_content``) and ensures the
  39. multimodal analysis prompt never sees those internal markers. Cleaning
  40. before truncation also guarantees the truncation point can never land
  41. inside an ``id="…"`` attribute and leave a malformed tag the strip
  42. regex would no longer recognize.
  43. Unlike the entity-extraction call site, the surrounding path invokes
  44. the cleaner with ``keep_cite_tag=True``: parser-internal ``refid`` is
  45. removed but the ``<cite type="…">…</cite>`` wrapper is preserved so the
  46. VLM/LLM can still tell a reference label apart from inline prose
  47. (e.g. ``<cite type="table">表1</cite>`` makes it obvious the visible
  48. text "表1" denotes another table elsewhere in the document, rather
  49. than appearing as an ordinary noun phrase). Note this only affects
  50. ``drawings.json`` / ``equations.json`` surroundings — ``tables.json``
  51. surroundings still drop all *table* cite tags via :func:`remove_table_tags`
  52. because the target-table analysis should not be steered by dangling
  53. references to other tables.
  54. """
  55. from __future__ import annotations
  56. import json
  57. import logging
  58. import os
  59. import re
  60. from html import escape as html_escape
  61. from html import unescape as html_unescape
  62. from pathlib import Path
  63. from lightrag.chunk_schema import strip_internal_multimodal_markup_for_extraction
  64. from lightrag.constants import DEFAULT_R_SEPARATORS
  65. from lightrag.table_markup import (
  66. TABLE_TAG_RE,
  67. detect_table_format,
  68. parse_table_tag,
  69. serialize_html_rows,
  70. split_html_rows,
  71. )
  72. from lightrag.utils import Tokenizer
  73. logger = logging.getLogger(__name__)
  74. # ---------------------------------------------------------------------------
  75. # Tag scanner — atomises a string into a list of ``(kind, text)`` pieces so
  76. # the recursive splitter can treat ``<drawing/>``, ``<equation>…</equation>``
  77. # and ``<table>…</table>`` as indivisible.
  78. # ---------------------------------------------------------------------------
# Matches one complete multimodal tag: a self-closing ``<drawing … />`` or a
# paired ``<table …>…</table>`` / ``<equation …>…</equation>``.  Paired bodies
# are matched non-greedily, and ``re.DOTALL`` lets them span newlines.
_MM_TAG_RE = re.compile(
    r"<drawing\b[^>]*/>"
    r"|<table\b[^>]*>.*?</table>"
    r"|<equation\b[^>]*>.*?</equation>",
    re.DOTALL,
)
# Matches a ``<cite …>…</cite>`` element whose opening tag carries
# ``type="table"``; the lookahead accepts that attribute at any position
# inside the opening tag.
_TABLE_CITE_RE = re.compile(
    r'<cite\b(?=[^>]*\btype\s*=\s*"table")[^>]*>.*?</cite>',
    re.DOTALL,
)
  89. def _atomize(text: str) -> list[tuple[str, str]]:
  90. """Split ``text`` into ``(kind, content)`` atoms.
  91. ``kind`` ∈ ``{"text", "drawing", "equation", "table"}``.
  92. Concatenating all atom contents reproduces ``text`` verbatim.
  93. """
  94. atoms: list[tuple[str, str]] = []
  95. pos = 0
  96. for match in _MM_TAG_RE.finditer(text):
  97. if match.start() > pos:
  98. atoms.append(("text", text[pos : match.start()]))
  99. tag_text = match.group(0)
  100. if tag_text.startswith("<drawing"):
  101. kind = "drawing"
  102. elif tag_text.startswith("<table"):
  103. kind = "table"
  104. else:
  105. kind = "equation"
  106. atoms.append((kind, tag_text))
  107. pos = match.end()
  108. if pos < len(text):
  109. atoms.append(("text", text[pos:]))
  110. return atoms
  111. # ---------------------------------------------------------------------------
  112. # Target-tag locators. Each builds a regex that matches a complete tag
  113. # carrying the requested ``id`` attribute, regardless of attribute order.
  114. # ---------------------------------------------------------------------------
  115. def _drawing_pattern(item_id: str) -> re.Pattern[str]:
  116. esc = re.escape(item_id)
  117. return re.compile(
  118. rf'<drawing\b[^>]*?\bid\s*=\s*"{esc}"[^>]*?/>',
  119. re.DOTALL,
  120. )
  121. def _table_pattern(item_id: str) -> re.Pattern[str]:
  122. esc = re.escape(item_id)
  123. return re.compile(
  124. rf'<table\b[^>]*?\bid\s*=\s*"{esc}"[^>]*?>.*?</table>'
  125. rf'|<cite\b(?=[^>]*\btype\s*=\s*"table")'
  126. rf'(?=[^>]*\brefid\s*=\s*"{esc}")[^>]*>.*?</cite>',
  127. re.DOTALL,
  128. )
  129. def _equation_pattern(item_id: str) -> re.Pattern[str]:
  130. esc = re.escape(item_id)
  131. return re.compile(
  132. rf'<equation\b[^>]*?\bid\s*=\s*"{esc}"[^>]*?>.*?</equation>',
  133. re.DOTALL,
  134. )
  135. def find_target_span(
  136. kind: str, item_id: str, block_content: str
  137. ) -> tuple[int, int] | None:
  138. """Locate the target multimodal marker with the given ``id`` inside
  139. ``block_content``.
  140. Returns ``(start, end)`` byte offsets, or ``None`` if not found.
  141. ``kind`` is the sidecar root key — ``"drawings"`` / ``"tables"`` /
  142. ``"equations"``.
  143. """
  144. if kind == "drawings":
  145. pattern = _drawing_pattern(item_id)
  146. elif kind == "tables":
  147. pattern = _table_pattern(item_id)
  148. elif kind == "equations":
  149. pattern = _equation_pattern(item_id)
  150. else:
  151. return None
  152. match = pattern.search(block_content)
  153. if not match:
  154. return None
  155. return match.start(), match.end()
  156. # ---------------------------------------------------------------------------
  157. # Recursive splitter that respects multimodal tag atoms.
  158. # ---------------------------------------------------------------------------
  159. def _split_text_segment(text: str, separators: list[str]) -> tuple[list[str], int]:
  160. """Split ``text`` using the first separator that produces >1 pieces.
  161. Returns ``(segments, sep_index)`` where ``segments`` reproduces
  162. ``text`` verbatim when concatenated and ``sep_index`` is the index
  163. in ``separators`` of the separator that was used. When no listed
  164. separator yields >1 piece the original string is returned as a
  165. single-element list with ``sep_index = len(separators)`` — the
  166. caller is responsible for any further char-level fallback.
  167. The separator is kept attached to the preceding segment so the
  168. assembled accumulator preserves whitespace boundaries.
  169. """
  170. if not text:
  171. return [text], len(separators)
  172. for idx, sep in enumerate(separators):
  173. if not sep:
  174. continue
  175. if sep in text:
  176. parts = text.split(sep)
  177. assembled: list[str] = []
  178. for j, part in enumerate(parts):
  179. if j < len(parts) - 1:
  180. assembled.append(part + sep)
  181. else:
  182. if part:
  183. assembled.append(part)
  184. if len(assembled) > 1:
  185. return assembled, idx
  186. return [text], len(separators)
  187. def _count_tokens(tokenizer: Tokenizer, text: str) -> int:
  188. if not text:
  189. return 0
  190. return len(tokenizer.encode(text))
  191. def _char_trim_leading(text: str, max_tokens: int, tokenizer: Tokenizer) -> str:
  192. """Drop characters from the head until the token count fits.
  193. Used as the final char-level fallback for the ``leading`` half — we
  194. want to keep the *tail* of the text (closest to the target).
  195. """
  196. if _count_tokens(tokenizer, text) <= max_tokens:
  197. return text
  198. lo, hi = 0, len(text)
  199. while lo < hi:
  200. mid = (lo + hi) // 2
  201. if _count_tokens(tokenizer, text[mid:]) <= max_tokens:
  202. hi = mid
  203. else:
  204. lo = mid + 1
  205. return text[lo:]
  206. def _char_trim_trailing(text: str, max_tokens: int, tokenizer: Tokenizer) -> str:
  207. """Drop characters from the tail until the token count fits.
  208. Used as the final char-level fallback for the ``trailing`` half — we
  209. keep the *head* (closest to the target).
  210. """
  211. if _count_tokens(tokenizer, text) <= max_tokens:
  212. return text
  213. lo, hi = 0, len(text)
  214. while lo < hi:
  215. mid = (lo + hi + 1) // 2
  216. if _count_tokens(tokenizer, text[:mid]) <= max_tokens:
  217. lo = mid
  218. else:
  219. hi = mid - 1
  220. return text[:lo]
  221. # ---------------------------------------------------------------------------
  222. # Row-aware table trimming for drawings / equations surrounding.
  223. # ---------------------------------------------------------------------------
  224. def _row_trim_table_leading(
  225. tag_text: str, max_tokens: int, tokenizer: Tokenizer
  226. ) -> str | None:
  227. """Return a smaller ``<table>…</table>`` whose tail rows fit ``max_tokens``.
  228. For a JSON table, takes the last ``k`` rows (closest to the target)
  229. such that the re-wrapped tag still fits. For an HTML table, takes
  230. the last ``k`` ``<tr>``s with their wrapper context. Returns
  231. ``None`` when no row-bounded trim fits.
  232. """
  233. match = TABLE_TAG_RE.match(tag_text.strip())
  234. if not match:
  235. return None
  236. attrs = match.group("attrs")
  237. body = match.group("body")
  238. fmt = detect_table_format(attrs, body)
  239. if fmt == "json":
  240. parsed = parse_table_tag(tag_text)
  241. if not parsed:
  242. return None
  243. attrs_str, rows = parsed
  244. for k in range(len(rows) - 1, 0, -1):
  245. candidate = (
  246. f"<table {attrs_str}>"
  247. f"{json.dumps(rows[-k:], ensure_ascii=False)}"
  248. f"</table>"
  249. )
  250. if _count_tokens(tokenizer, candidate) <= max_tokens:
  251. return candidate
  252. return _char_fallback_json_table(
  253. attrs_str,
  254. json.dumps(rows[-1], ensure_ascii=False) if rows else body,
  255. max_tokens,
  256. tokenizer,
  257. keep_tail=True,
  258. )
  259. if fmt == "html":
  260. rows = split_html_rows(body)
  261. if not rows:
  262. return None
  263. for k in range(len(rows) - 1, 0, -1):
  264. inner = serialize_html_rows(rows[-k:])
  265. candidate = f"<table {attrs}>{inner}</table>"
  266. if _count_tokens(tokenizer, candidate) <= max_tokens:
  267. return candidate
  268. return _char_fallback_html_table(
  269. attrs,
  270. rows[-1][1] if rows else body,
  271. max_tokens,
  272. tokenizer,
  273. keep_tail=True,
  274. )
  275. return None
  276. def _row_trim_table_trailing(
  277. tag_text: str, max_tokens: int, tokenizer: Tokenizer
  278. ) -> str | None:
  279. """Return a smaller ``<table>…</table>`` whose head rows fit ``max_tokens``."""
  280. match = TABLE_TAG_RE.match(tag_text.strip())
  281. if not match:
  282. return None
  283. attrs = match.group("attrs")
  284. body = match.group("body")
  285. fmt = detect_table_format(attrs, body)
  286. if fmt == "json":
  287. parsed = parse_table_tag(tag_text)
  288. if not parsed:
  289. return None
  290. attrs_str, rows = parsed
  291. for k in range(len(rows) - 1, 0, -1):
  292. candidate = (
  293. f"<table {attrs_str}>"
  294. f"{json.dumps(rows[:k], ensure_ascii=False)}"
  295. f"</table>"
  296. )
  297. if _count_tokens(tokenizer, candidate) <= max_tokens:
  298. return candidate
  299. return _char_fallback_json_table(
  300. attrs_str,
  301. json.dumps(rows[0], ensure_ascii=False) if rows else body,
  302. max_tokens,
  303. tokenizer,
  304. keep_tail=False,
  305. )
  306. if fmt == "html":
  307. rows = split_html_rows(body)
  308. if not rows:
  309. return None
  310. for k in range(len(rows) - 1, 0, -1):
  311. inner = serialize_html_rows(rows[:k])
  312. candidate = f"<table {attrs}>{inner}</table>"
  313. if _count_tokens(tokenizer, candidate) <= max_tokens:
  314. return candidate
  315. return _char_fallback_html_table(
  316. attrs,
  317. rows[0][1] if rows else body,
  318. max_tokens,
  319. tokenizer,
  320. keep_tail=False,
  321. )
  322. return None
  323. def _empty_table(attrs: str) -> str:
  324. return f"<table {attrs}></table>"
  325. def _char_fallback_json_table(
  326. attrs: str,
  327. source_text: str,
  328. max_tokens: int,
  329. tokenizer: Tokenizer,
  330. *,
  331. keep_tail: bool,
  332. ) -> str | None:
  333. """Fit one oversized JSON table row while keeping a valid table tag.
  334. The fallback stores the truncated serialized row text as a JSON string
  335. inside a one-row table. That preserves JSON validity and keeps the
  336. closest side of the oversized row when no complete row can fit.
  337. """
  338. empty = _empty_table(attrs)
  339. if _count_tokens(tokenizer, empty) > max_tokens:
  340. return None
  341. def candidate(chars: int) -> str:
  342. snippet = source_text[-chars:] if keep_tail and chars else source_text[:chars]
  343. if not chars:
  344. return empty
  345. body = json.dumps([[snippet]], ensure_ascii=False)
  346. return f"<table {attrs}>{body}</table>"
  347. if _count_tokens(tokenizer, candidate(len(source_text))) <= max_tokens:
  348. return candidate(len(source_text))
  349. lo, hi = 0, len(source_text)
  350. while lo < hi:
  351. mid = (lo + hi + 1) // 2
  352. if _count_tokens(tokenizer, candidate(mid)) <= max_tokens:
  353. lo = mid
  354. else:
  355. hi = mid - 1
  356. return candidate(lo)
  357. def _char_fallback_html_table(
  358. attrs: str,
  359. row_html: str,
  360. max_tokens: int,
  361. tokenizer: Tokenizer,
  362. *,
  363. keep_tail: bool,
  364. ) -> str | None:
  365. """Fit one oversized HTML row without emitting broken table markup."""
  366. empty = _empty_table(attrs)
  367. if _count_tokens(tokenizer, empty) > max_tokens:
  368. return None
  369. text = html_unescape(re.sub(r"<[^>]+>", "", row_html or ""))
  370. def candidate(chars: int) -> str:
  371. snippet = text[-chars:] if keep_tail and chars else text[:chars]
  372. if not chars:
  373. return empty
  374. return f"<table {attrs}><tr><td>{html_escape(snippet)}</td></tr></table>"
  375. if _count_tokens(tokenizer, candidate(len(text))) <= max_tokens:
  376. return candidate(len(text))
  377. lo, hi = 0, len(text)
  378. while lo < hi:
  379. mid = (lo + hi + 1) // 2
  380. if _count_tokens(tokenizer, candidate(mid)) <= max_tokens:
  381. lo = mid
  382. else:
  383. hi = mid - 1
  384. return candidate(lo)
  385. def remove_table_tags(text: str) -> str:
  386. """Strip every table marker from ``text``.
  387. Used to pre-clean candidate text for ``tables.json`` surroundings:
  388. we never include sibling tables, so they must be dropped *before*
  389. token counting and segmentation so the budget matches the persisted
  390. string exactly.
  391. """
  392. return _TABLE_CITE_RE.sub("", TABLE_TAG_RE.sub("", text))
  393. # ---------------------------------------------------------------------------
  394. # Core leading / trailing builders.
  395. # ---------------------------------------------------------------------------
  396. def _build_leading(
  397. source: str,
  398. *,
  399. kind: str,
  400. tokenizer: Tokenizer,
  401. max_tokens: int,
  402. separators: list[str],
  403. ) -> str:
  404. """Build the ``leading`` half: suffix of ``source`` within budget.
  405. ``source`` is cleaned via
  406. :func:`lightrag.chunk_schema.strip_internal_multimodal_markup_for_extraction`
  407. *before* atomization and token-budgeted accumulation, so parser-internal
  408. identifiers (``id`` / ``path`` / ``src`` / ``refid``) never reach the
  409. accumulated output and the token budget reflects what the LLM actually
  410. sees. Cleaning before truncation also prevents a truncation point from
  411. landing inside an ``id="…"`` attribute and producing a malformed tag
  412. that the strip regex would no longer recognize.
  413. """
  414. if not source or max_tokens <= 0:
  415. return ""
  416. if kind == "tables":
  417. source = remove_table_tags(source)
  418. if not source:
  419. return ""
  420. source = strip_internal_multimodal_markup_for_extraction(source, keep_cite_tag=True)
  421. if not source:
  422. return ""
  423. accumulated = ""
  424. atoms = _atomize(source)
  425. for atom_idx in range(len(atoms) - 1, -1, -1):
  426. atom_kind, atom_text = atoms[atom_idx]
  427. if not atom_text:
  428. continue
  429. if atom_kind in {"drawing", "equation"}:
  430. candidate = atom_text + accumulated
  431. if _count_tokens(tokenizer, candidate) <= max_tokens:
  432. accumulated = candidate
  433. continue
  434. break
  435. if atom_kind == "table":
  436. # Only reached for drawings/equations surroundings — table
  437. # tags are pre-stripped for the ``tables`` kind above.
  438. candidate = atom_text + accumulated
  439. if _count_tokens(tokenizer, candidate) <= max_tokens:
  440. accumulated = candidate
  441. continue
  442. remaining = max_tokens - _count_tokens(tokenizer, accumulated)
  443. if remaining > 0:
  444. trimmed = _row_trim_table_leading(atom_text, remaining, tokenizer)
  445. if trimmed is not None:
  446. accumulated = trimmed + accumulated
  447. break
  448. # Plain text atom — segment with separator cascade and accumulate
  449. # from the right.
  450. addition = _accumulate_text_leading(
  451. atom_text,
  452. existing=accumulated,
  453. tokenizer=tokenizer,
  454. max_tokens=max_tokens,
  455. separators=separators,
  456. )
  457. if addition is None:
  458. # Even a partial fit was not possible; we stop here.
  459. break
  460. accumulated = addition + accumulated
  461. if _count_tokens(tokenizer, accumulated) >= max_tokens:
  462. break
  463. return accumulated
  464. def _accumulate_text_leading(
  465. text: str,
  466. *,
  467. existing: str,
  468. tokenizer: Tokenizer,
  469. max_tokens: int,
  470. separators: list[str],
  471. ) -> str | None:
  472. """Add as much of ``text`` (suffix) as fits into the remaining budget.
  473. Returns the chunk to prepend to ``existing``, or ``None`` to signal
  474. "stop walking earlier atoms" (i.e. budget exhausted with no useful
  475. addition).
  476. """
  477. segments, sep_idx = _split_text_segment(text, separators)
  478. if not segments:
  479. return None
  480. # Try to add whole segments from the right. ``buf`` is what we will
  481. # prepend to ``existing``.
  482. buf = ""
  483. for i in range(len(segments) - 1, -1, -1):
  484. candidate = segments[i] + buf
  485. # Total tokens once we prepend ``candidate`` to ``existing``.
  486. if _count_tokens(tokenizer, candidate + existing) <= max_tokens:
  487. buf = candidate
  488. continue
  489. # Cannot fit segment ``i`` whole. Two cases:
  490. if buf:
  491. # We already added at least one segment — stop here without
  492. # char-truncating a more-distant segment.
  493. return buf
  494. # ``buf`` is empty: the closest segment alone overflows. Recurse
  495. # into the next separator level so we try a finer split before
  496. # falling back to characters.
  497. weaker = separators[sep_idx + 1 :] if sep_idx < len(separators) else []
  498. if weaker:
  499. return _accumulate_text_leading(
  500. segments[i],
  501. existing=existing,
  502. tokenizer=tokenizer,
  503. max_tokens=max_tokens,
  504. separators=weaker,
  505. )
  506. # Char-level fallback: take the longest suffix of this segment
  507. # that fits the remaining budget.
  508. remaining = max_tokens - _count_tokens(tokenizer, existing)
  509. if remaining <= 0:
  510. return None
  511. trimmed = _char_trim_leading(segments[i], remaining, tokenizer)
  512. return trimmed if trimmed else None
  513. return buf if buf else None
  514. def _build_trailing(
  515. source: str,
  516. *,
  517. kind: str,
  518. tokenizer: Tokenizer,
  519. max_tokens: int,
  520. separators: list[str],
  521. ) -> str:
  522. """Build the ``trailing`` half: prefix of ``source`` within budget.
  523. See :func:`_build_leading` for the rationale behind stripping
  524. parser-internal markers *before* atomization and truncation.
  525. """
  526. if not source or max_tokens <= 0:
  527. return ""
  528. if kind == "tables":
  529. source = remove_table_tags(source)
  530. if not source:
  531. return ""
  532. source = strip_internal_multimodal_markup_for_extraction(source, keep_cite_tag=True)
  533. if not source:
  534. return ""
  535. accumulated = ""
  536. atoms = _atomize(source)
  537. for atom_kind, atom_text in atoms:
  538. if not atom_text:
  539. continue
  540. if atom_kind in {"drawing", "equation"}:
  541. candidate = accumulated + atom_text
  542. if _count_tokens(tokenizer, candidate) <= max_tokens:
  543. accumulated = candidate
  544. continue
  545. break
  546. if atom_kind == "table":
  547. candidate = accumulated + atom_text
  548. if _count_tokens(tokenizer, candidate) <= max_tokens:
  549. accumulated = candidate
  550. continue
  551. remaining = max_tokens - _count_tokens(tokenizer, accumulated)
  552. if remaining > 0:
  553. trimmed = _row_trim_table_trailing(atom_text, remaining, tokenizer)
  554. if trimmed is not None:
  555. accumulated = accumulated + trimmed
  556. break
  557. addition = _accumulate_text_trailing(
  558. atom_text,
  559. existing=accumulated,
  560. tokenizer=tokenizer,
  561. max_tokens=max_tokens,
  562. separators=separators,
  563. )
  564. if addition is None:
  565. break
  566. accumulated = accumulated + addition
  567. if _count_tokens(tokenizer, accumulated) >= max_tokens:
  568. break
  569. return accumulated
  570. def _accumulate_text_trailing(
  571. text: str,
  572. *,
  573. existing: str,
  574. tokenizer: Tokenizer,
  575. max_tokens: int,
  576. separators: list[str],
  577. ) -> str | None:
  578. segments, sep_idx = _split_text_segment(text, separators)
  579. if not segments:
  580. return None
  581. buf = ""
  582. for i, seg in enumerate(segments):
  583. candidate = buf + seg
  584. if _count_tokens(tokenizer, existing + candidate) <= max_tokens:
  585. buf = candidate
  586. continue
  587. if buf:
  588. return buf
  589. weaker = separators[sep_idx + 1 :] if sep_idx < len(separators) else []
  590. if weaker:
  591. return _accumulate_text_trailing(
  592. seg,
  593. existing=existing,
  594. tokenizer=tokenizer,
  595. max_tokens=max_tokens,
  596. separators=weaker,
  597. )
  598. remaining = max_tokens - _count_tokens(tokenizer, existing)
  599. if remaining <= 0:
  600. return None
  601. trimmed = _char_trim_trailing(seg, remaining, tokenizer)
  602. return trimmed if trimmed else None
  603. return buf if buf else None
  604. # ---------------------------------------------------------------------------
  605. # Public entrypoints.
  606. # ---------------------------------------------------------------------------
  607. def load_chunk_separators() -> list[str]:
  608. """Resolve the recursive-character separator cascade.
  609. Reads ``CHUNK_R_SEPARATORS`` and falls back to
  610. :data:`lightrag.constants.DEFAULT_R_SEPARATORS` on missing / invalid
  611. JSON. The returned list always has the empty-string sentinel
  612. dropped — char fallback is signalled separately by the caller.
  613. """
  614. raw = os.getenv("CHUNK_R_SEPARATORS")
  615. separators: list[str]
  616. if raw:
  617. try:
  618. parsed = json.loads(raw)
  619. if isinstance(parsed, list) and all(isinstance(s, str) for s in parsed):
  620. separators = parsed
  621. else:
  622. separators = list(DEFAULT_R_SEPARATORS)
  623. except json.JSONDecodeError:
  624. separators = list(DEFAULT_R_SEPARATORS)
  625. else:
  626. separators = list(DEFAULT_R_SEPARATORS)
  627. return [s for s in separators if s]
  628. def load_content_rows_by_blockid(blocks_path: str) -> dict[str, str]:
  629. """Read ``blocks.jsonl`` and return ``{blockid: content_str}``.
  630. Only ``type == "content"`` rows are kept. When the same blockid
  631. appears multiple times, the first occurrence wins.
  632. """
  633. rows: dict[str, str] = {}
  634. path = Path(blocks_path)
  635. if not path.exists():
  636. return rows
  637. with path.open("r", encoding="utf-8") as fh:
  638. for line in fh:
  639. line = line.strip()
  640. if not line:
  641. continue
  642. try:
  643. obj = json.loads(line)
  644. except json.JSONDecodeError:
  645. continue
  646. if not isinstance(obj, dict):
  647. continue
  648. if obj.get("type") != "content":
  649. continue
  650. blockid = obj.get("blockid")
  651. if not isinstance(blockid, str) or not blockid:
  652. continue
  653. if blockid in rows:
  654. continue
  655. content = obj.get("content")
  656. if isinstance(content, str):
  657. rows[blockid] = content
  658. return rows
  659. DEFAULT_SURROUNDING_MAX_TOKENS = 2000
  660. def _resolve_surrounding_budget(
  661. leading_max_tokens: int | None,
  662. trailing_max_tokens: int | None,
  663. ) -> tuple[int, int]:
  664. """Resolve per-half token budgets, defaulting to env vars then 2000.
  665. Reads ``SURROUNDING_LEADING_MAX_TOKENS`` / ``SURROUNDING_TRAILING_MAX_TOKENS``
  666. when the caller passes ``None``. Invalid env values fall back to
  667. :data:`DEFAULT_SURROUNDING_MAX_TOKENS`.
  668. """
  669. def _from_env(env_var: str) -> int:
  670. raw = os.getenv(env_var)
  671. if raw is None or not raw.strip():
  672. return DEFAULT_SURROUNDING_MAX_TOKENS
  673. try:
  674. value = int(raw)
  675. except ValueError:
  676. logger.warning(
  677. "[multimodal_context] invalid %s=%r; falling back to %d",
  678. env_var,
  679. raw,
  680. DEFAULT_SURROUNDING_MAX_TOKENS,
  681. )
  682. return DEFAULT_SURROUNDING_MAX_TOKENS
  683. return max(0, value)
  684. leading = (
  685. leading_max_tokens
  686. if leading_max_tokens is not None
  687. else _from_env("SURROUNDING_LEADING_MAX_TOKENS")
  688. )
  689. trailing = (
  690. trailing_max_tokens
  691. if trailing_max_tokens is not None
  692. else _from_env("SURROUNDING_TRAILING_MAX_TOKENS")
  693. )
  694. return leading, trailing
  695. _CONTENT_TRUNCATION_MARKER = (
  696. "\n<!-- content truncated from {original} to {final} tokens, head preserved -->"
  697. )
  698. def trim_content_to_budget(
  699. content: str,
  700. *,
  701. kind: str,
  702. max_tokens: int,
  703. tokenizer: Tokenizer | None,
  704. ) -> tuple[str, bool]:
  705. """Trim sidecar ``content`` to fit within ``max_tokens``, preserving the head.
  706. Used by ``analyze_multimodal`` to keep the EXTRACT-role prompt within
  707. :data:`lightrag.constants.DEFAULT_MAX_EXTRACT_INPUT_TOKENS`. Only ``content``
  708. is compressed — surrounding/captions/footnotes already have their own caps
  709. and the prompt template is fixed.
  710. Strategy:
  711. - ``tables`` (``<table>…</table>`` wrapped): row-aware trim via
  712. :func:`_row_trim_table_trailing` (keep head rows / first k <tr>);
  713. falls back to ``_char_fallback_*`` (still ``<table>``-wrapped) when
  714. no single row fits. Non-``<table>`` content falls through to char
  715. trim from the tail.
  716. - ``equations`` / other: :func:`_char_trim_trailing` (keep head chars).
  717. A trailing HTML-comment marker is appended *outside* the ``<table>``
  718. wrapper (when trimmed) so the LLM knows the body is incomplete. The
  719. marker is included in the token budget.
  720. Returns ``(possibly_trimmed_content, was_trimmed)``. When
  721. ``max_tokens <= 0`` or ``tokenizer is None`` the input is returned
  722. unchanged with ``was_trimmed=False``.
  723. """
  724. if not content or tokenizer is None or max_tokens <= 0:
  725. return content, False
  726. original_tokens = _count_tokens(tokenizer, content)
  727. if original_tokens <= max_tokens:
  728. return content, False
  729. # Reserve token room for the truncation marker before trimming.
  730. marker_probe = _CONTENT_TRUNCATION_MARKER.format(
  731. original=original_tokens, final=max_tokens
  732. )
  733. marker_tokens = _count_tokens(tokenizer, marker_probe)
  734. inner_budget = max(0, max_tokens - marker_tokens)
  735. trimmed_inner: str | None = None
  736. if kind == "tables" and TABLE_TAG_RE.match(content.strip()):
  737. # _row_trim_table_trailing keeps head rows and internally falls back
  738. # to char-level fits while preserving the <table> wrapper. Only
  739. # malformed / unrecognized-format markup returns None.
  740. trimmed_inner = _row_trim_table_trailing(content, inner_budget, tokenizer)
  741. if trimmed_inner is None:
  742. trimmed_inner = _char_trim_trailing(content, inner_budget, tokenizer)
  743. final_tokens = _count_tokens(tokenizer, trimmed_inner)
  744. marker = _CONTENT_TRUNCATION_MARKER.format(
  745. original=original_tokens, final=final_tokens
  746. )
  747. return trimmed_inner + marker, True
  748. def build_surrounding(
  749. *,
  750. kind: str,
  751. block_content: str,
  752. span: tuple[int, int],
  753. tokenizer: Tokenizer,
  754. leading_max_tokens: int,
  755. trailing_max_tokens: int,
  756. separators: list[str],
  757. ) -> dict[str, str]:
  758. """Compute ``{"leading": …, "trailing": …}`` for one sidecar entry.
  759. ``leading_max_tokens`` and ``trailing_max_tokens`` are independent
  760. per-half caps so deployments can tune the two contexts separately
  761. via ``SURROUNDING_LEADING_MAX_TOKENS`` / ``SURROUNDING_TRAILING_MAX_TOKENS``.
  762. The returned strings have parser-internal markers (``id`` / ``path``
  763. / ``src`` / ``refid``) stripped — the cleaning happens before
  764. token-budgeted truncation inside :func:`_build_leading` /
  765. :func:`_build_trailing`, so the budget reflects the LLM-visible
  766. content and truncation cannot leave malformed tags behind.
  767. """
  768. start, end = span
  769. leading_src = block_content[:start]
  770. trailing_src = block_content[end:]
  771. leading = _build_leading(
  772. leading_src,
  773. kind=kind,
  774. tokenizer=tokenizer,
  775. max_tokens=leading_max_tokens,
  776. separators=separators,
  777. )
  778. trailing = _build_trailing(
  779. trailing_src,
  780. kind=kind,
  781. tokenizer=tokenizer,
  782. max_tokens=trailing_max_tokens,
  783. separators=separators,
  784. )
  785. return {"leading": leading, "trailing": trailing}
  786. def enrich_sidecars_with_surrounding(
  787. *,
  788. blocks_path: str,
  789. enabled_modalities: set[str],
  790. tokenizer: Tokenizer,
  791. leading_max_tokens: int | None = None,
  792. trailing_max_tokens: int | None = None,
  793. separators: list[str] | None = None,
  794. ) -> dict[str, int]:
  795. """Backfill ``surrounding`` on enabled-modality sidecars.
  796. Args:
  797. blocks_path: path to the ``…blocks.jsonl`` artifact.
  798. enabled_modalities: subset of ``{"drawings", "tables",
  799. "equations"}`` reflecting the document's ``process_options``.
  800. tokenizer: tokenizer used to enforce the per-half token budget.
  801. leading_max_tokens: leading-half cap. ``None`` reads
  802. ``SURROUNDING_LEADING_MAX_TOKENS`` (default 2000).
  803. trailing_max_tokens: trailing-half cap. ``None`` reads
  804. ``SURROUNDING_TRAILING_MAX_TOKENS`` (default 2000).
  805. separators: explicit separator cascade. Defaults to the cascade
  806. resolved from ``CHUNK_R_SEPARATORS`` (or
  807. ``DEFAULT_R_SEPARATORS``).
  808. Returns:
  809. ``{modality: updated_entries}`` for diagnostics. Modalities
  810. without a sidecar on disk are silently skipped (consistent with
  811. the rest of the multimodal pipeline).
  812. """
  813. counts = {"drawings": 0, "tables": 0, "equations": 0}
  814. if not enabled_modalities:
  815. return counts
  816. blocks_file = Path(blocks_path)
  817. if not blocks_file.exists():
  818. return counts
  819. content_by_blockid = load_content_rows_by_blockid(blocks_path)
  820. if separators is None:
  821. separators = load_chunk_separators()
  822. leading_tokens, trailing_tokens = _resolve_surrounding_budget(
  823. leading_max_tokens, trailing_max_tokens
  824. )
  825. base = str(blocks_file)
  826. if base.endswith(".blocks.jsonl"):
  827. base = base[: -len(".blocks.jsonl")]
  828. for root_key in ("drawings", "tables", "equations"):
  829. if root_key not in enabled_modalities:
  830. continue
  831. sidecar_path = Path(base + f".{root_key}.json")
  832. if not sidecar_path.exists():
  833. continue
  834. try:
  835. payload = json.loads(sidecar_path.read_text(encoding="utf-8"))
  836. except (OSError, json.JSONDecodeError) as exc:
  837. logger.warning(
  838. "[multimodal_context] failed to read %s: %s",
  839. sidecar_path,
  840. exc,
  841. )
  842. continue
  843. items = payload.get(root_key)
  844. if not isinstance(items, dict):
  845. continue
  846. updated = 0
  847. for item_id, item in items.items():
  848. if not isinstance(item, dict):
  849. continue
  850. blockid = item.get("blockid")
  851. if not isinstance(blockid, str) or not blockid:
  852. continue
  853. block_content = content_by_blockid.get(blockid)
  854. if block_content is None:
  855. continue
  856. span = find_target_span(root_key, item_id, block_content)
  857. if span is None:
  858. logger.debug(
  859. "[multimodal_context] %s/%s: id not found in block %s",
  860. root_key,
  861. item_id,
  862. blockid,
  863. )
  864. continue
  865. surrounding = build_surrounding(
  866. kind=root_key,
  867. block_content=block_content,
  868. span=span,
  869. tokenizer=tokenizer,
  870. leading_max_tokens=leading_tokens,
  871. trailing_max_tokens=trailing_tokens,
  872. separators=separators,
  873. )
  874. item["surrounding"] = surrounding
  875. updated += 1
  876. counts[root_key] = updated
  877. try:
  878. sidecar_path.write_text(
  879. json.dumps(payload, ensure_ascii=False, indent=2),
  880. encoding="utf-8",
  881. )
  882. except OSError as exc:
  883. logger.warning(
  884. "[multimodal_context] failed to write %s: %s",
  885. sidecar_path,
  886. exc,
  887. )
  888. continue
  889. logger.debug(
  890. "[multimodal_context] %s: surrounding written for %d entries",
  891. root_key,
  892. updated,
  893. )
  894. return counts
# Public surface of this module: the names bound by
# ``from <this module> import *``.
__all__ = [
    "DEFAULT_SURROUNDING_MAX_TOKENS",
    "build_surrounding",
    "enrich_sidecars_with_surrounding",
    "find_target_span",
    "load_chunk_separators",
    "load_content_rows_by_blockid",
    "remove_table_tags",
    "trim_content_to_budget",
]