paragraph_semantic.py 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503
  1. """Paragraph Semantic Chunking for LightRAG.
  2. Reads a LightRAG ``.blocks.jsonl`` artifact (produced by the docx native
  3. parser at ``fixlevel=0`` — heading-driven splits only, tables kept whole)
  4. and produces a chunk list compatible with
  5. :func:`lightrag.chunker.chunking_by_token_size`.
  6. The full algorithm and rationale are documented in
  7. ``docs/ParagraphSemanticChunking-zh.md``. This module re-implements the
  8. post-Stage-A pipeline (B/C/D) on top of blocks.jsonl input, parameterised
  9. on ``chunk_token_size`` so chunk size targets follow the user's RAG
  10. configuration rather than the audit-mode constants in
  11. ``lightrag/parser/docx/parse_document.py``.
  12. Pipeline:
  13. - Stage A — heading-driven initial split: already done at parse time and
  14. persisted as one row per block in ``blocks.jsonl``.
  15. - Stage B — oversized-table re-split + first/middle/last gluing: invoked
  16. here when an embedded ``<table … format="json">`` (or
  17. ``format="html"``) exceeds ``TABLE_MAX_TOKENS``. Splitting prefers
  18. structural row boundaries (JSON list items, HTML ``<tr>`` rows) so
  19. each fragment remains a legal ``<table>`` tag; only when no row
  20. boundary is available, or a single row alone exceeds the cap, does
  21. the splitter fall back to ``chunking_by_recursive_character`` on
  22. that specific fragment. When two oversized tables are separated by
  23. text inside the same heading block, the bridge text may be duplicated
  24. into both table boundary chunks so each table keeps nearby context.
  25. - Stage C — anchor-driven long-block re-split: short non-table
  26. paragraphs (≤ 100 chars) are promoted as split points and the block
  27. is rebalanced toward ``IDEAL_BLOCK_TOKENS``. When no anchor exists,
  28. table-aware fallback applies the same row-boundary-first strategy
  29. to any oversized table paragraph and only character-splits the
  30. residual non-table content. Character fallback for ordinary text uses
  31. the configured paragraph-semantic overlap.
  32. - Stage D — bottom-up, level-aware small-block merging: undersized
  33. blocks get absorbed by same-level neighbours (Phase A), shallower
  34. levels (Phase B), and a final tail-absorption pass eliminates the
  35. last few zero-content remainders.
  36. """
  37. from __future__ import annotations
  38. import json
  39. import math
  40. import re
  41. from pathlib import Path
  42. from typing import Any, Callable
  43. from lightrag.table_markup import (
  44. TABLE_TAG_RE as _TABLE_TAG_RE,
  45. detect_table_format as _detect_table_format,
  46. serialize_html_rows as _serialize_rows_with_wrappers,
  47. split_html_rows as _split_html_rows,
  48. )
  49. from lightrag.utils import Tokenizer, logger
# ---------------------------------------------------------------------------
# Threshold ratios — derived from the audit-mode constants in
# lightrag/parser/docx/parse_document.py so the trade-off curves
# (table vs. block size, ideal vs. max, etc.) carry over verbatim. The
# absolute values scale with the user-configured ``chunk_token_size``.
# ---------------------------------------------------------------------------
# IDEAL/MAX = 6000/8000 = 0.75 in audit mode.
_IDEAL_RATIO = 0.75
# TABLE_MAX/MAX = 5000/8000 = 0.625 in audit mode.
_TABLE_MAX_RATIO = 0.625
# TABLE_IDEAL/MAX = 3000/8000 = 0.375 in audit mode.
_TABLE_IDEAL_RATIO = 0.375
# TABLE_MIN_LAST/TABLE_MAX = (TABLE_MAX-TABLE_IDEAL)*0.8/TABLE_MAX
# = (5000-3000)*0.8/5000 = 0.32 in audit mode.
_TABLE_MIN_LAST_RATIO = 0.32
# SMALL_TAIL_THRESHOLD/MAX = (MAX-IDEAL)/2/MAX = 1000/8000 = 0.125.
_SMALL_TAIL_RATIO = 0.125
# Anchor candidate length is a UI/readability constraint — keep absolute.
_MAX_ANCHOR_CANDIDATE_LENGTH = 100  # characters
# Table tag regex (``_TABLE_TAG_RE``) plus the ``_detect_table_format``,
# ``_split_html_rows`` and ``_serialize_rows_with_wrappers`` helpers are
# imported from :mod:`lightrag.table_markup` so the surrounding-context
# extractor can reuse the same primitives.
# Trailing legacy marker ``[表格片段N]`` ("table fragment N"), with optional
# surrounding whitespace, anchored at the end of a heading.
_LEGACY_TABLE_CHUNK_SUFFIX_RE = re.compile(r"\s*\[表格片段\d+\]\s*$")
# Trailing ``[part N]`` marker (case-insensitive), anchored at the end of a
# heading. Both patterns are stripped by
# ``_strip_generated_heading_suffixes`` before a new part number is assigned.
_PART_SUFFIX_RE = re.compile(r"\s*\[part\s+\d+\]\s*$", re.IGNORECASE)
  75. # ---------------------------------------------------------------------------
  76. # Shared helpers.
  77. # ---------------------------------------------------------------------------
  78. def _count_tokens(tokenizer: Tokenizer, text: str) -> int:
  79. if not text:
  80. return 0
  81. return len(tokenizer.encode(text))
  82. def _bounded_overlap(target_max: int, chunk_overlap_token_size: int) -> int:
  83. """Return an overlap value safe for recursive-character splitting."""
  84. overlap = max(int(chunk_overlap_token_size), 0)
  85. if target_max <= 1:
  86. return 0
  87. return min(overlap, target_max - 1)
  88. def _strip_generated_heading_suffixes(heading: str) -> str:
  89. """Remove generated split suffixes before assigning a fresh part number."""
  90. cleaned = (heading or "").rstrip()
  91. while True:
  92. next_cleaned = _PART_SUFFIX_RE.sub("", cleaned).rstrip()
  93. next_cleaned = _LEGACY_TABLE_CHUNK_SUFFIX_RE.sub("", next_cleaned).rstrip()
  94. if next_cleaned == cleaned:
  95. return cleaned
  96. cleaned = next_cleaned
  97. def _append_part_suffix(heading: str, part_number: int) -> str:
  98. base = _strip_generated_heading_suffixes(heading)
  99. suffix = f"[part {part_number}]"
  100. return f"{base} {suffix}" if base else suffix
  101. def _apply_part_suffixes(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
  102. """Tag split fragments from one original block as ``[part n]``."""
  103. if len(blocks) <= 1:
  104. return blocks
  105. for idx, block in enumerate(blocks, start=1):
  106. block["heading"] = _append_part_suffix(block.get("heading", ""), idx)
  107. return blocks
  108. def _is_table_paragraph(text: str) -> bool:
  109. stripped = text.strip()
  110. return stripped.startswith("<table ") and stripped.endswith("</table>")
  111. def _block_to_paragraphs(content: str) -> list[dict[str, Any]]:
  112. """Recover the per-paragraph view of a rewritten block.
  113. The docx parser joins paragraphs with ``\\n`` inside
  114. ``_build_unsplit_block``; tables/equations/drawings are inserted as
  115. single-line tags with no internal newlines, so ``split("\\n")`` faithfully
  116. recovers paragraph boundaries.
  117. """
  118. paragraphs: list[dict[str, Any]] = []
  119. for line in content.split("\n"):
  120. if not line.strip():
  121. continue
  122. paragraphs.append({"text": line, "is_table": _is_table_paragraph(line)})
  123. return paragraphs
  124. def _load_blocks_from_jsonl(blocks_path: str) -> list[dict[str, Any]]:
  125. """Read ``type == "content"`` rows from a blocks.jsonl file in order."""
  126. rows: list[dict[str, Any]] = []
  127. with Path(blocks_path).open("r", encoding="utf-8") as fh:
  128. for raw in fh:
  129. raw = raw.strip()
  130. if not raw:
  131. continue
  132. try:
  133. obj = json.loads(raw)
  134. except json.JSONDecodeError:
  135. continue
  136. if isinstance(obj, dict) and obj.get("type") == "content":
  137. rows.append(obj)
  138. return rows
  139. def _split_html_rows_by_tokens(
  140. rows: list[tuple[str, str]],
  141. tokenizer: Tokenizer,
  142. *,
  143. target_max: int,
  144. target_ideal: int,
  145. last_min: int,
  146. ) -> list[list[tuple[str, str]]]:
  147. """HTML-tuple analog of :func:`_split_rows_by_tokens`.
  148. Same balanced-split + tail-merge algorithm; tokens are measured on
  149. the row payloads (``tr_str``) only — wrapper overhead is amortised
  150. later by the per-chunk serialiser plus the re-split-on-overflow
  151. safety net in :func:`_split_table_text`.
  152. """
  153. total = _count_tokens(tokenizer, "".join(tr for _, tr in rows))
  154. if total <= target_max or len(rows) <= 1:
  155. return [rows]
  156. target_chunks = max(
  157. math.ceil(total / target_ideal),
  158. math.ceil(total / target_max),
  159. )
  160. target_chunks = min(target_chunks, len(rows))
  161. target_rows = len(rows) / target_chunks
  162. chunks: list[list[tuple[str, str]]] = []
  163. start = 0
  164. for i in range(target_chunks):
  165. if i == target_chunks - 1:
  166. end = len(rows)
  167. else:
  168. end = max(start + 1, min(int((i + 1) * target_rows), len(rows)))
  169. remaining = len(rows) - end
  170. if remaining > 0 and remaining < target_rows * 0.3:
  171. end = len(rows)
  172. chunks.append(rows[start:end])
  173. start = end
  174. if start >= len(rows):
  175. break
  176. if len(chunks) >= 2:
  177. last_text = "".join(tr for _, tr in chunks[-1])
  178. if _count_tokens(tokenizer, last_text) < last_min:
  179. merged = chunks[-2] + chunks[-1]
  180. merged_tokens = _count_tokens(tokenizer, "".join(tr for _, tr in merged))
  181. if merged_tokens <= target_max:
  182. chunks[-2] = merged
  183. chunks.pop()
  184. return chunks
  185. def _dedup_preserving_order(values: list[str]) -> list[str]:
  186. seen: set[str] = set()
  187. out: list[str] = []
  188. for v in values:
  189. if v and v not in seen:
  190. seen.add(v)
  191. out.append(v)
  192. return out
  193. def _new_block(
  194. *,
  195. heading: str,
  196. parent_headings: list[str],
  197. level: int,
  198. paragraphs: list[dict[str, Any]],
  199. table_chunk_role: str,
  200. tokenizer: Tokenizer,
  201. blockids: list[str] | None = None,
  202. ) -> dict[str, Any]:
  203. content = "\n".join(p["text"] for p in paragraphs)
  204. return {
  205. "heading": heading,
  206. "parent_headings": list(parent_headings),
  207. "level": level,
  208. "paragraphs": list(paragraphs),
  209. "content": content,
  210. "tokens": _count_tokens(tokenizer, content),
  211. "table_chunk_role": table_chunk_role,
  212. # Ordered list of source blockids (deduped). Empty when the input
  213. # blocks.jsonl row did not carry a blockid (raw/legacy input).
  214. "blockids": _dedup_preserving_order(list(blockids or [])),
  215. }
  216. # ---------------------------------------------------------------------------
  217. # Stage B — oversized-table re-split with first/middle/last gluing.
  218. # ---------------------------------------------------------------------------
  219. def _split_rows_by_tokens(
  220. rows: list[Any],
  221. tokenizer: Tokenizer,
  222. *,
  223. target_max: int,
  224. target_ideal: int,
  225. last_min: int,
  226. ) -> list[list[Any]]:
  227. """Split ``rows`` into balanced row-bounded chunks (Stage B core)."""
  228. total = _count_tokens(tokenizer, json.dumps(rows, ensure_ascii=False))
  229. if total <= target_max or len(rows) <= 1:
  230. return [rows]
  231. target_chunks = max(
  232. math.ceil(total / target_ideal),
  233. math.ceil(total / target_max),
  234. )
  235. # Cap at len(rows) so target_rows >= 1; otherwise int((i+1)*target_rows)
  236. # can collapse to ``start`` and emit empty <table>[]</table> slices.
  237. target_chunks = min(target_chunks, len(rows))
  238. target_rows = len(rows) / target_chunks
  239. chunks: list[list[Any]] = []
  240. start = 0
  241. for i in range(target_chunks):
  242. if i == target_chunks - 1:
  243. end = len(rows)
  244. else:
  245. # max(start + 1, ...) guarantees forward progress (>= 1 row per
  246. # slice) even at fractional target_rows boundaries.
  247. end = max(start + 1, min(int((i + 1) * target_rows), len(rows)))
  248. remaining = len(rows) - end
  249. if remaining > 0 and remaining < target_rows * 0.3:
  250. end = len(rows)
  251. chunks.append(rows[start:end])
  252. start = end
  253. if start >= len(rows):
  254. break
  255. # Merge a tiny last chunk back into the previous chunk when feasible.
  256. if len(chunks) >= 2:
  257. last_json = json.dumps(chunks[-1], ensure_ascii=False)
  258. if _count_tokens(tokenizer, last_json) < last_min:
  259. merged = chunks[-2] + chunks[-1]
  260. merged_tokens = _count_tokens(
  261. tokenizer, json.dumps(merged, ensure_ascii=False)
  262. )
  263. if merged_tokens <= target_max:
  264. chunks[-2] = merged
  265. chunks.pop()
  266. return chunks
  267. def _character_split_text(
  268. text: str,
  269. tokenizer: Tokenizer,
  270. *,
  271. target_max: int,
  272. chunk_overlap_token_size: int = 0,
  273. ) -> list[str]:
  274. """Character-level fallback wrapped to return plain-text pieces.
  275. Lazy import dodges the ``recursive_character`` ↔ ``paragraph_semantic``
  276. circular dependency (same pattern as the sidecar-missing fallback in
  277. :func:`chunking_by_paragraph_semantic`). Callers that split ordinary
  278. prose pass the paragraph-semantic overlap; table character fallbacks
  279. leave the default at zero so structured table row chunks do not gain
  280. implicit row-level overlap.
  281. """
  282. from lightrag.chunker.recursive_character import (
  283. chunking_by_recursive_character,
  284. )
  285. pieces = chunking_by_recursive_character(
  286. tokenizer,
  287. text,
  288. target_max,
  289. chunk_overlap_token_size=_bounded_overlap(target_max, chunk_overlap_token_size),
  290. )
  291. return [p["content"] for p in pieces if p.get("content")]
  292. def _split_table_text(
  293. table_text: str,
  294. *,
  295. tokenizer: Tokenizer,
  296. target_max: int,
  297. target_ideal: int,
  298. last_min: int,
  299. ) -> list[str]:
  300. """Split a single oversized ``<table>...</table>`` text into ≤ target_max pieces.
  301. Strategy (mirrors the user-supplied contract in
  302. ``docs/ParagraphSemanticChunking-zh.md`` — row boundary first,
  303. character fallback last):
  304. 1. Match the outer ``<table {attrs}>{body}</table>``. If the regex
  305. fails, character-split the original text and return.
  306. 2. Detect the body format via :func:`_detect_table_format` (with
  307. body sniffing when ``attrs`` is silent).
  308. 3. Row-boundary split: JSON via :func:`_split_rows_by_tokens`,
  309. HTML via :func:`_split_html_rows_by_tokens`. Re-wrap every
  310. row-chunk as ``<table {attrs}>{rows}</table>``.
  311. 4. For any wrapped chunk still exceeding ``target_max``
  312. (single-row chunks where the row alone exceeds the cap, or
  313. row-split returned a single chunk because rows were ≤ 1),
  314. character-fallback that specific chunk's text.
  315. 5. Unknown / unparseable format → character-fallback the entire
  316. original text.
  317. Output strings are either:
  318. - a re-wrapped ``<table {attrs}>{rows}</table>`` (legal markup,
  319. callers may keep ``is_table=True`` for these), or
  320. - a character-fallback fragment (no ``<table>`` wrapper, callers
  321. should mark ``is_table=False``).
  322. """
  323. match = _TABLE_TAG_RE.match((table_text or "").strip())
  324. if not match:
  325. return _character_split_text(table_text, tokenizer, target_max=target_max)
  326. attrs = match.group("attrs")
  327. body = match.group("body")
  328. fmt = _detect_table_format(attrs, body)
  329. # Budget the <table {attrs}></table> wrapper out of the per-chunk
  330. # caps before calling the row splitter — the splitter only measures
  331. # the body (json.dumps(rows) / "".join(rows)), so without this the
  332. # wrapped chunk can exceed target_max purely from the wrapper, which
  333. # would force a needless character-fallback below.
  334. wrapper_overhead = _count_tokens(tokenizer, f"<table {attrs}></table>")
  335. body_max = max(target_max - wrapper_overhead, 1)
  336. body_ideal = max(min(target_ideal, target_max) - wrapper_overhead, 1)
  337. body_last_min = max(last_min - wrapper_overhead, 1)
  338. row_chunks: list[list[Any]] | None = None
  339. serialize: Callable[[list[Any]], str] | None = None
  340. if fmt == "json":
  341. try:
  342. rows = json.loads(body)
  343. except json.JSONDecodeError:
  344. rows = None
  345. if isinstance(rows, list) and len(rows) > 1:
  346. row_chunks = _split_rows_by_tokens(
  347. rows,
  348. tokenizer,
  349. target_max=body_max,
  350. target_ideal=body_ideal,
  351. last_min=body_last_min,
  352. )
  353. def serialize(chunk_rows: list[Any]) -> str:
  354. return (
  355. f"<table {attrs}>"
  356. f"{json.dumps(chunk_rows, ensure_ascii=False)}"
  357. f"</table>"
  358. )
  359. elif fmt == "html":
  360. rows_html = _split_html_rows(body)
  361. if rows_html and len(rows_html) > 1:
  362. row_chunks = _split_html_rows_by_tokens(
  363. rows_html,
  364. tokenizer,
  365. target_max=body_max,
  366. target_ideal=body_ideal,
  367. last_min=body_last_min,
  368. )
  369. def serialize(chunk_rows: list[tuple[str, str]]) -> str:
  370. return (
  371. f"<table {attrs}>"
  372. f"{_serialize_rows_with_wrappers(chunk_rows)}"
  373. f"</table>"
  374. )
  375. if row_chunks is None or serialize is None:
  376. # No row boundary available (single-row table, parse failure,
  377. # unknown format) → character-fallback the whole text.
  378. return _character_split_text(table_text, tokenizer, target_max=target_max)
  379. # Re-split any chunk whose wrapped form still exceeds target_max
  380. # before resorting to character-level shredding. The row splitter's
  381. # balanced-cut heuristic can produce uneven chunks when row sizes
  382. # vary, and only a chunk that has collapsed to a single row (where
  383. # row-boundary splitting can no longer reduce it) belongs in the
  384. # character fallback.
  385. pieces: list[str] = []
  386. pending: list[list[Any]] = list(row_chunks)
  387. while pending:
  388. chunk_rows = pending.pop(0)
  389. wrapped = serialize(chunk_rows)
  390. if _count_tokens(tokenizer, wrapped) <= target_max:
  391. pieces.append(wrapped)
  392. continue
  393. if len(chunk_rows) <= 1:
  394. pieces.extend(
  395. _character_split_text(wrapped, tokenizer, target_max=target_max)
  396. )
  397. continue
  398. # Force a finer cut: cap the next-pass body budget at half the
  399. # current wrapped size so target_chunks >= 2 inside the splitter.
  400. # This guarantees forward progress (one row at minimum per
  401. # sub-chunk, see the splitter's len(rows) cap).
  402. halved = max(_count_tokens(tokenizer, wrapped) // 2, 1)
  403. sub_max = max(min(body_max, halved), 1)
  404. sub_ideal = max(sub_max // 2, 1)
  405. sub_last_min = max(min(body_last_min, sub_max // 2), 1)
  406. if fmt == "json":
  407. sub_chunks = _split_rows_by_tokens(
  408. chunk_rows,
  409. tokenizer,
  410. target_max=sub_max,
  411. target_ideal=sub_ideal,
  412. last_min=sub_last_min,
  413. )
  414. else:
  415. sub_chunks = _split_html_rows_by_tokens(
  416. chunk_rows,
  417. tokenizer,
  418. target_max=sub_max,
  419. target_ideal=sub_ideal,
  420. last_min=sub_last_min,
  421. )
  422. if len(sub_chunks) <= 1:
  423. # The splitter could not reduce further (e.g. one row already
  424. # dominates the body). Avoid an infinite loop and let the
  425. # character fallback handle this stubborn chunk.
  426. pieces.extend(
  427. _character_split_text(wrapped, tokenizer, target_max=target_max)
  428. )
  429. continue
  430. # Process the finer cuts before any remaining peer chunks so the
  431. # output keeps source order.
  432. pending[0:0] = sub_chunks
  433. return pieces
def _expand_block_with_table_splits(
    block: dict[str, Any],
    *,
    tokenizer: Tokenizer,
    table_max: int,
    table_ideal: int,
    table_min_last: int,
    target_max: int | None = None,
    chunk_overlap_token_size: int = 0,
) -> list[dict[str, Any]]:
    """Apply Stage B to one heading-driven block.

    For every embedded table whose tokens exceed ``table_max``:

    - the first row-slice glues with paragraphs already accumulated in
      the current expansion (i.e. content *before* the table);
    - middle slices are emitted as standalone blocks tagged
      ``table_chunk_role == "middle"`` so Stage D refuses to merge them;
    - the last slice begins a fresh accumulation that will glue with
      paragraphs *after* the table.

    When a ``last`` table slice is followed by short bridge text and then
    another oversized table's ``first`` slice, the bridge text is split
    into table boundary context: a prefix may be duplicated into the
    previous table block and a suffix into the next table block. If the
    bridge is longer than both context budgets, the remaining middle text
    is emitted as a standalone text block. Tables within the size limit
    pass through untouched.

    Args:
        block: Block dict; ``"paragraphs"``, ``"heading"``,
            ``"parent_headings"`` and ``"level"`` are read directly,
            ``"blockids"`` is optional.
        tokenizer: Tokenizer used for token counts and for encoding/decoding
            bridge text.
        table_max: A table paragraph with more tokens than this is split;
            also the per-piece cap handed to :func:`_split_table_text`.
        table_ideal: Ideal piece size handed to :func:`_split_table_text`.
        table_min_last: Minimum last-piece size handed to
            :func:`_split_table_text`.
        target_max: Block-level cap used only to budget bridge context.
            Defaults to ``table_max``; coerced to an int of at least 1.
        chunk_overlap_token_size: Requested bridge-context size, clamped via
            :func:`_bounded_overlap`. A clamped value of 0 disables bridge
            duplication.

    Returns:
        ``[block]`` (the same object) when no table exceeds ``table_max``;
        otherwise a list of newly built blocks in source order.
    """
    if target_max is None:
        target_max = table_max
    target_max = max(int(target_max), 1)
    context_overlap = _bounded_overlap(target_max, chunk_overlap_token_size)
    sep_tokens = _count_tokens(tokenizer, "\n")
    paragraphs = block["paragraphs"]
    has_oversized_table = any(
        p["is_table"] and _count_tokens(tokenizer, p["text"]) > table_max
        for p in paragraphs
    )
    if not has_oversized_table:
        return [block]
    out: list[dict[str, Any]] = []
    cur_paras: list[dict[str, Any]] = []
    # Role to assign to ``cur_paras`` when it next flushes. Tracks the
    # boundary semantics across split-table iterations so the merged
    # block carries "first" / "last" instead of defaulting to "none" —
    # otherwise Stage D's directional protections (a "first" block must
    # not absorb backward, a "last" block must not absorb forward) silently
    # disappear after the slice glues with surrounding paragraphs.
    cur_role = "none"

    def flush_cur() -> None:
        """Emit the accumulated paragraphs as one block and reset the buffer."""
        nonlocal cur_role
        if not cur_paras:
            cur_role = "none"
            return
        out.append(
            _new_block(
                heading=block["heading"],
                parent_headings=block["parent_headings"],
                level=block["level"],
                paragraphs=cur_paras,
                table_chunk_role=cur_role,
                tokenizer=tokenizer,
                blockids=block.get("blockids"),
            )
        )
        # Safe to clear in place: ``_new_block`` stores a copy of the list.
        cur_paras.clear()
        cur_role = "none"

    def _append_bridge_block(
        paragraphs: list[dict[str, Any]],
        table_chunk_role: str,
    ) -> None:
        """Emit ``paragraphs`` directly as a block with the given role.

        Unlike ``flush_cur`` this does not touch ``cur_paras``/``cur_role``.
        Note the ``paragraphs`` parameter shadows the outer variable.
        """
        if not paragraphs:
            return
        out.append(
            _new_block(
                heading=block["heading"],
                parent_headings=block["parent_headings"],
                level=block["level"],
                paragraphs=paragraphs,
                table_chunk_role=table_chunk_role,
                tokenizer=tokenizer,
                blockids=block.get("blockids"),
            )
        )

    def _text_paragraph(text: str) -> dict[str, Any] | None:
        """Wrap non-blank ``text`` as a non-table paragraph, else ``None``."""
        if not text or not text.strip():
            return None
        return {"text": text, "is_table": False}

    def _context_capacity(base_paras: list[dict[str, Any]]) -> int:
        """Return how many context tokens may be added next to ``base_paras``.

        The budget is the smaller of the clamped overlap and the room left
        under ``target_max`` after the base text and one separator.
        """
        if context_overlap <= 0:
            return 0
        base_text = "\n".join(p["text"] for p in base_paras)
        base_tokens = _count_tokens(tokenizer, base_text)
        if base_tokens >= target_max:
            return 0
        # The context paragraph is joined to the table fragment with "\n".
        return max(min(context_overlap, target_max - base_tokens - sep_tokens), 0)

    def _flush_last_bridge_before_next_first(
        next_first_para: dict[str, Any],
    ) -> list[dict[str, Any]]:
        """Flush ``last + bridge`` before a following table ``first``.

        Returns context paragraphs to prepend to the following first-table
        block. Only non-table bridge paragraphs are duplicated/sliced; if
        the bridge contains tables we keep the prior non-overlapping flush.
        """
        nonlocal cur_role
        if not cur_paras:
            cur_role = "none"
            return []
        # The buffer was seeded with the previous table's last slice, so
        # index 0 is that slice and the rest is the bridge.
        seed_paras = [cur_paras[0]]
        bridge_paras = cur_paras[1:]
        if (
            context_overlap <= 0
            or not bridge_paras
            or any(p.get("is_table", False) for p in bridge_paras)
        ):
            flush_cur()
            return []
        bridge_text = "\n".join(p["text"] for p in bridge_paras)
        bridge_tokens = tokenizer.encode(bridge_text)
        if not bridge_tokens:
            flush_cur()
            return []
        prev_budget = _context_capacity(seed_paras)
        next_budget = _context_capacity([next_first_para])
        bridge_len = len(bridge_tokens)
        if bridge_len <= prev_budget and bridge_len <= next_budget:
            # The whole bridge fits on both sides: duplicate it verbatim.
            prefix_text = bridge_text
            suffix_text = bridge_text
            middle_text = ""
        else:
            # Slice at token level: head goes to the previous table block,
            # tail to the next one, and whatever neither side covers
            # becomes a standalone middle block.
            prefix_len = min(prev_budget, bridge_len)
            suffix_len = min(next_budget, bridge_len)
            middle_start = prefix_len
            middle_end = max(middle_start, bridge_len - suffix_len)
            prefix_text = (
                tokenizer.decode(bridge_tokens[:prefix_len]) if prefix_len else ""
            )
            suffix_text = (
                tokenizer.decode(bridge_tokens[bridge_len - suffix_len :])
                if suffix_len
                else ""
            )
            middle_text = (
                tokenizer.decode(bridge_tokens[middle_start:middle_end])
                if middle_end > middle_start
                else ""
            )
        prev_paras = list(seed_paras)
        prefix_para = _text_paragraph(prefix_text)
        if prefix_para is not None:
            prev_paras.append(prefix_para)
        _append_bridge_block(prev_paras, "last")
        middle_para = _text_paragraph(middle_text)
        if middle_para is not None:
            _append_bridge_block([middle_para], "none")
        # Everything buffered has been emitted above; start a clean buffer.
        cur_paras.clear()
        cur_role = "none"
        suffix_para = _text_paragraph(suffix_text)
        return [suffix_para] if suffix_para is not None else []

    for para in paragraphs:
        text = para["text"]
        if not (para["is_table"] and _count_tokens(tokenizer, text) > table_max):
            cur_paras.append(para)
            continue
        # Row-boundary first, character fallback last. ``_split_table_text``
        # returns one or more strings: row-wrapped ``<table>...</table>``
        # fragments where row-splitting succeeded, plain text where it
        # had to character-split (single-row tables, parse failures,
        # rows whose own size exceeded ``table_max``).
        pieces = _split_table_text(
            text,
            tokenizer=tokenizer,
            target_max=table_max,
            target_ideal=table_ideal,
            last_min=table_min_last,
        )
        if len(pieces) <= 1:
            # No reduction was possible (e.g. very small unparseable table
            # that already fits within ``table_max`` after a no-op character
            # fallback). Keep the original paragraph to preserve content.
            cur_paras.append(para)
            continue
        for chunk_idx, piece_text in enumerate(pieces):
            stripped = piece_text.strip()
            # Same test as ``_is_table_paragraph``: character-fallback
            # fragments lose the wrapper and are treated as plain text.
            is_still_table = stripped.startswith("<table ") and stripped.endswith(
                "</table>"
            )
            chunk_para = {"text": piece_text, "is_table": is_still_table}
            is_first = chunk_idx == 0
            is_last = chunk_idx == len(pieces) - 1
            if is_first:
                # First slice glues with everything currently accumulated
                # (= the paragraphs that appeared before the table inside
                # this heading block). If the buffer still carries the
                # "last" tail of a previous oversized table, flush it first
                # so its protective role survives instead of being
                # overwritten by "first".
                if cur_role == "last":
                    cur_paras.extend(_flush_last_bridge_before_next_first(chunk_para))
                cur_paras.append(chunk_para)
                cur_role = "first"
            elif is_last:
                # Flush the accumulated "first-glued" block, then begin a
                # new accumulation seeded with this last slice — it will
                # absorb the paragraphs that appear after the table.
                flush_cur()
                cur_paras.append(chunk_para)
                cur_role = "last"
            else:
                # Middle slice: flush the first-glued block, then emit
                # this middle slice as a standalone block that Stage D
                # MUST keep intact (table_chunk_role == "middle").
                flush_cur()
                out.append(
                    _new_block(
                        heading=block["heading"],
                        parent_headings=block["parent_headings"],
                        level=block["level"],
                        paragraphs=[chunk_para],
                        table_chunk_role="middle",
                        tokenizer=tokenizer,
                        blockids=block.get("blockids"),
                    )
                )
    # Emit whatever is still buffered (trailing paragraphs or a last slice).
    flush_cur()
    return out
  659. # ---------------------------------------------------------------------------
  660. # Stage C — anchor-driven long-block re-split.
  661. # ---------------------------------------------------------------------------
  662. def _split_long_block(
  663. paragraphs: list[dict[str, Any]],
  664. heading: str,
  665. parent_headings: list[str],
  666. level: int,
  667. table_chunk_role: str,
  668. *,
  669. tokenizer: Tokenizer,
  670. target_max: int,
  671. target_ideal: int,
  672. chunk_overlap_token_size: int = 100,
  673. blockids: list[str] | None = None,
  674. ) -> list[dict[str, Any]]:
  675. """Split an oversized block into balanced sub-blocks at short-paragraph anchors.
  676. Mirrors :func:`lightrag.parser.docx.parse_document.split_long_block`,
  677. parameterised on ``target_max`` / ``target_ideal``. Tables (``is_table``)
  678. are excluded from the anchor candidate pool, so Stage B's row-level
  679. splits stay intact. When no anchor exists (including the single-
  680. paragraph oversized case), the no-anchor branch below honors the cap
  681. via row-boundary splitting (for tables) or character-level splitting
  682. (for prose). The audit-mode parser would ``sys.exit(1)`` on no-anchor
  683. failure, but the RAG pipeline must never drop a document silently.
  684. Character-level splitting of ordinary prose uses
  685. ``chunk_overlap_token_size`` so long text under one JSONL content row
  686. keeps semantic continuity across adjacent chunks.
  687. """
  688. chunk_overlap_token_size = _bounded_overlap(target_max, chunk_overlap_token_size)
  689. content = "\n".join(p["text"] for p in paragraphs)
  690. total = _count_tokens(tokenizer, content)
  691. if total <= target_max:
  692. return [
  693. _new_block(
  694. heading=heading,
  695. parent_headings=parent_headings,
  696. level=level,
  697. paragraphs=paragraphs,
  698. table_chunk_role=table_chunk_role,
  699. tokenizer=tokenizer,
  700. blockids=blockids,
  701. )
  702. ]
  703. target_blocks = max(
  704. math.ceil(total / target_ideal),
  705. math.ceil(total / target_max),
  706. )
  707. target_size = total / target_blocks
  708. # Build anchor candidates with cumulative token offsets. Index 0 is
  709. # excluded: an anchor at the first paragraph yields an empty leading
  710. # slice and a tail equal to the input, so it cannot divide the block —
  711. # selecting it would re-enter this function with the same arguments
  712. # and recurse until RecursionError.
  713. candidates: list[dict[str, Any]] = []
  714. cumulative = 0
  715. for idx, para in enumerate(paragraphs):
  716. text = para["text"]
  717. if (
  718. idx > 0
  719. and not para.get("is_table", False)
  720. and 0 < len(text) <= _MAX_ANCHOR_CANDIDATE_LENGTH
  721. ):
  722. candidates.append({"index": idx, "text": text, "position": cumulative})
  723. cumulative += _count_tokens(tokenizer, text)
  724. if not candidates:
  725. # All paragraphs in the block are longer than the anchor-length
  726. # cap (typical for dense academic prose: every paragraph is a
  727. # full body section). Anchor-driven splitting cannot proceed,
  728. # but we must NOT emit a single oversized chunk: the
  729. # embedding-time hard fallback uses ``embedding_token_limit``
  730. # (often 8K), not ``chunk_token_size``, so the chunk would
  731. # silently exceed the user-configured size. Prefer
  732. # row-boundary splitting on any oversized table paragraph
  733. # before falling back to character-level splitting on residual
  734. # content — character splitting destroys ``<table>`` markup
  735. # mid-tag and produces fragments LLMs can't interpret as
  736. # tables.
  737. logger.warning(
  738. "[paragraph_semantic_chunking] block under heading %r exceeds "
  739. "target_max=%d tokens (~%d tokens) but has no eligible anchor "
  740. "paragraph (≤ %d chars); preferring table row-boundary split, "
  741. "falling back to recursive-character splitting on residual "
  742. "content.",
  743. heading,
  744. target_max,
  745. total,
  746. _MAX_ANCHOR_CANDIDATE_LENGTH,
  747. )
  748. # Step 1: expand each oversized table paragraph into row-bounded
  749. # pieces; non-table or in-budget paragraphs pass through verbatim.
  750. # ``last_min`` mirrors Stage B's ratio (no separate constant — the
  751. # tail-merge threshold is purely a row-balancing heuristic).
  752. last_min = max(int(target_max * _TABLE_MIN_LAST_RATIO), 1)
  753. pieces: list[str] = []
  754. for para in paragraphs:
  755. text = para["text"]
  756. if (
  757. para.get("is_table", False)
  758. and _count_tokens(tokenizer, text) > target_max
  759. ):
  760. pieces.extend(
  761. _split_table_text(
  762. text,
  763. tokenizer=tokenizer,
  764. target_max=target_max,
  765. target_ideal=target_ideal,
  766. last_min=last_min,
  767. )
  768. )
  769. else:
  770. pieces.append(text)
  771. # Step 2: greedy-pack pieces into chunks ≤ target_max. A piece
  772. # that is itself oversized (e.g. a single dense prose paragraph
  773. # without short anchors) is character-split via
  774. # :func:`chunking_by_recursive_character` after flushing the
  775. # current buffer. The "\n" separator inserted by ``"\n".join(buf)``
  776. # also costs tokens, so it must be debited from the budget —
  777. # otherwise two pieces that sum to exactly target_max would
  778. # overflow once joined.
  779. sep_tokens = _count_tokens(tokenizer, "\n")
  780. chunks_text: list[str] = []
  781. buf: list[str] = []
  782. buf_tokens = 0
  783. for piece in pieces:
  784. piece_tokens = _count_tokens(tokenizer, piece)
  785. if piece_tokens > target_max:
  786. if buf:
  787. chunks_text.append("\n".join(buf))
  788. buf, buf_tokens = [], 0
  789. chunks_text.extend(
  790. _character_split_text(
  791. piece,
  792. tokenizer,
  793. target_max=target_max,
  794. chunk_overlap_token_size=chunk_overlap_token_size,
  795. )
  796. )
  797. continue
  798. addition = piece_tokens + (sep_tokens if buf else 0)
  799. if buf and buf_tokens + addition > target_max:
  800. chunks_text.append("\n".join(buf))
  801. buf, buf_tokens = [], 0
  802. addition = piece_tokens
  803. buf.append(piece)
  804. buf_tokens += addition
  805. if buf:
  806. chunks_text.append("\n".join(buf))
  807. if not chunks_text:
  808. # Defensive: every piece was empty after stripping. Emit the
  809. # original oversized block so the document is never silently
  810. # dropped (matches the prior behaviour of the empty-R branch).
  811. return [
  812. _new_block(
  813. heading=heading,
  814. parent_headings=parent_headings,
  815. level=level,
  816. paragraphs=paragraphs,
  817. table_chunk_role=table_chunk_role,
  818. tokenizer=tokenizer,
  819. blockids=blockids,
  820. )
  821. ]
  822. sub_blocks: list[dict[str, Any]] = []
  823. for i, chunk_text in enumerate(chunks_text):
  824. stripped = chunk_text.strip()
  825. is_still_table = stripped.startswith("<table ") and stripped.endswith(
  826. "</table>"
  827. )
  828. sub_blocks.append(
  829. _new_block(
  830. heading=heading,
  831. parent_headings=parent_headings,
  832. level=level,
  833. paragraphs=[{"text": chunk_text, "is_table": is_still_table}],
  834. # Only the first sub-block keeps the inbound
  835. # table_chunk_role; the rest are text-only by
  836. # construction (mirrors the anchor-split path below).
  837. table_chunk_role=table_chunk_role if i == 0 else "none",
  838. tokenizer=tokenizer,
  839. blockids=blockids,
  840. )
  841. )
  842. return sub_blocks
  843. # Pick the anchors closest to evenly-spaced ideal positions.
  844. pool = list(candidates)
  845. selected: list[dict[str, Any]] = []
  846. for i in range(1, target_blocks):
  847. if not pool:
  848. break
  849. ideal_position = i * target_size
  850. best = min(pool, key=lambda c: abs(c["position"] - ideal_position))
  851. selected.append(best)
  852. pool.remove(best)
  853. selected.sort(key=lambda c: c["index"])
  854. sub_blocks: list[dict[str, Any]] = []
  855. prev_idx = 0
  856. cur_heading = heading
  857. cur_parents = list(parent_headings)
  858. # Only the first sub-block keeps the inbound table_chunk_role; the
  859. # post-anchor sub-blocks are text-only by construction.
  860. cur_role = table_chunk_role
  861. for anchor in selected:
  862. split_idx = anchor["index"]
  863. slice_paras = paragraphs[prev_idx:split_idx]
  864. if slice_paras:
  865. sub_blocks.append(
  866. _new_block(
  867. heading=cur_heading,
  868. parent_headings=cur_parents,
  869. level=level,
  870. paragraphs=slice_paras,
  871. table_chunk_role=cur_role,
  872. tokenizer=tokenizer,
  873. blockids=blockids,
  874. )
  875. )
  876. # Anchor becomes the first paragraph (and heading) of the next sub-block.
  877. cur_parents = (
  878. list(parent_headings) + [heading]
  879. if heading and cur_heading == heading
  880. else list(cur_parents)
  881. )
  882. cur_heading = anchor["text"]
  883. cur_role = "none"
  884. prev_idx = split_idx
  885. tail = paragraphs[prev_idx:]
  886. if tail:
  887. sub_blocks.append(
  888. _new_block(
  889. heading=cur_heading,
  890. parent_headings=cur_parents,
  891. level=level,
  892. paragraphs=tail,
  893. table_chunk_role=cur_role,
  894. tokenizer=tokenizer,
  895. blockids=blockids,
  896. )
  897. )
  898. # Recursive guard: any sub-block still over target_max is re-split,
  899. # including single-paragraph subs — the no-anchor branch above honors
  900. # the cap via row-boundary or character-level splitting and is the
  901. # only path that can shrink them.
  902. out: list[dict[str, Any]] = []
  903. for sub in sub_blocks:
  904. if sub["tokens"] > target_max:
  905. out.extend(
  906. _split_long_block(
  907. sub["paragraphs"],
  908. sub["heading"],
  909. sub["parent_headings"],
  910. sub["level"],
  911. sub["table_chunk_role"],
  912. tokenizer=tokenizer,
  913. target_max=target_max,
  914. target_ideal=target_ideal,
  915. chunk_overlap_token_size=chunk_overlap_token_size,
  916. blockids=sub.get("blockids") or blockids,
  917. )
  918. )
  919. else:
  920. out.append(sub)
  921. return out
  922. # ---------------------------------------------------------------------------
  923. # Stage D — bottom-up, level-aware small-block merging.
  924. # ---------------------------------------------------------------------------
  925. def _can_merge_forward(role: str, *, phase: str) -> bool:
  926. if phase == "A":
  927. return role in {"none", "first"}
  928. return role in {"none", "first", "last"}
  929. def _can_merge_backward(role: str) -> bool:
  930. return role in {"none", "last"}
  931. def _merged_pair(
  932. left: dict[str, Any],
  933. right: dict[str, Any],
  934. *,
  935. keep: str,
  936. tokenizer: Tokenizer,
  937. ) -> dict[str, Any]:
  938. base = left if keep == "left" else right
  939. paragraphs = list(left["paragraphs"]) + list(right["paragraphs"])
  940. content = left["content"] + "\n\n" + right["content"]
  941. merged_blockids = _dedup_preserving_order(
  942. list(left.get("blockids") or []) + list(right.get("blockids") or [])
  943. )
  944. return {
  945. "heading": base["heading"],
  946. "parent_headings": list(base["parent_headings"]),
  947. "level": base["level"],
  948. "paragraphs": paragraphs,
  949. "content": content,
  950. "tokens": _count_tokens(tokenizer, content),
  951. "table_chunk_role": "none",
  952. "blockids": merged_blockids,
  953. }
  954. def _merge_small_blocks(
  955. blocks: list[dict[str, Any]],
  956. *,
  957. tokenizer: Tokenizer,
  958. target_max: int,
  959. target_ideal: int,
  960. small_tail_threshold: int,
  961. ) -> list[dict[str, Any]]:
  962. """Bottom-up, level-aware small-block merging.
  963. Re-implementation of
  964. :func:`lightrag.parser.docx.parse_document.merge_small_blocks`,
  965. parameterised on the chunk-size targets and operating on internal
  966. block dicts (no ``uuid`` / ``table_header`` propagation needed: the
  967. chunking output schema does not carry them).
  968. """
  969. if len(blocks) <= 1:
  970. return blocks
  971. result = list(blocks)
  972. levels = sorted({b.get("level", 1) for b in result}, reverse=True)
  973. for current_level in levels:
  974. # Phase A — same-level merging.
  975. changed = True
  976. while changed:
  977. changed = False
  978. new_result: list[dict[str, Any]] = []
  979. i = 0
  980. while i < len(result):
  981. cur = result[i]
  982. cur_tokens = cur["tokens"]
  983. cur_level = cur.get("level", 1)
  984. cur_role = cur.get("table_chunk_role", "none")
  985. below_ideal = 0 < cur_tokens < target_ideal
  986. is_cur_lv = cur_level == current_level
  987. if below_ideal and is_cur_lv:
  988. merged = False
  989. if _can_merge_forward(cur_role, phase="A") and i + 1 < len(result):
  990. nxt = result[i + 1]
  991. if nxt.get("level", 1) == current_level and _can_merge_backward(
  992. nxt.get("table_chunk_role", "none")
  993. ):
  994. combined = _merged_pair(
  995. cur, nxt, keep="left", tokenizer=tokenizer
  996. )
  997. if combined["tokens"] <= target_max:
  998. new_result.append(combined)
  999. i += 2
  1000. changed = True
  1001. merged = True
  1002. if not merged and _can_merge_backward(cur_role) and new_result:
  1003. prev = new_result[-1]
  1004. if (
  1005. prev.get("level", 1) == current_level
  1006. and _can_merge_forward(
  1007. prev.get("table_chunk_role", "none"), phase="A"
  1008. )
  1009. and prev["tokens"] < target_ideal
  1010. ):
  1011. combined = _merged_pair(
  1012. prev, cur, keep="left", tokenizer=tokenizer
  1013. )
  1014. if combined["tokens"] <= target_max:
  1015. new_result[-1] = combined
  1016. i += 1
  1017. changed = True
  1018. merged = True
  1019. if not merged:
  1020. new_result.append(cur)
  1021. i += 1
  1022. else:
  1023. # Tail absorption: an at-or-above-IDEAL block can absorb
  1024. # a short run of subsequent same-level blocks if their
  1025. # combined size stays under SMALL_TAIL_THRESHOLD and
  1026. # fits within target_max — eliminates the document's
  1027. # trailing sliver of zero-content remainders.
  1028. if is_cur_lv and cur_tokens >= target_ideal:
  1029. tail_total = 0
  1030. end_idx = i + 1
  1031. for j in range(i + 1, len(result)):
  1032. nxt = result[j]
  1033. if nxt.get("level", 1) != current_level:
  1034. break
  1035. if nxt.get("table_chunk_role", "none") == "middle":
  1036. break
  1037. tail_total += nxt["tokens"]
  1038. end_idx = j + 1
  1039. if (
  1040. tail_total > 0
  1041. and tail_total < small_tail_threshold
  1042. and cur_tokens + tail_total <= target_max
  1043. ):
  1044. absorbed_paragraphs = list(cur["paragraphs"])
  1045. absorbed_content = cur["content"]
  1046. for j in range(i + 1, end_idx):
  1047. nxt = result[j]
  1048. absorbed_paragraphs.extend(nxt["paragraphs"])
  1049. absorbed_content += "\n\n" + nxt["content"]
  1050. # The cheap predicate above sums per-block
  1051. # tokens, but absorption joins blocks with
  1052. # ``"\n\n"`` — those separator tokens are
  1053. # real and can push the merged block over
  1054. # target_max. Re-measure the joined content
  1055. # before committing to absorb.
  1056. absorbed_tokens = _count_tokens(tokenizer, absorbed_content)
  1057. if absorbed_tokens <= target_max:
  1058. new_result.append(
  1059. {
  1060. "heading": cur["heading"],
  1061. "parent_headings": list(cur["parent_headings"]),
  1062. "level": cur["level"],
  1063. "paragraphs": absorbed_paragraphs,
  1064. "content": absorbed_content,
  1065. "tokens": absorbed_tokens,
  1066. "table_chunk_role": "none",
  1067. }
  1068. )
  1069. i = end_idx
  1070. changed = True
  1071. continue
  1072. new_result.append(cur)
  1073. i += 1
  1074. result = new_result
  1075. # Phase B — cross-level absorption (shallower absorbs deeper).
  1076. changed = True
  1077. while changed:
  1078. changed = False
  1079. new_result = []
  1080. i = 0
  1081. while i < len(result):
  1082. cur = result[i]
  1083. cur_tokens = cur["tokens"]
  1084. cur_level = cur.get("level", 1)
  1085. cur_role = cur.get("table_chunk_role", "none")
  1086. below_ideal = 0 < cur_tokens < target_ideal
  1087. is_cur_lv = cur_level == current_level
  1088. if below_ideal and is_cur_lv:
  1089. merged = False
  1090. if _can_merge_forward(cur_role, phase="B") and i + 1 < len(result):
  1091. nxt = result[i + 1]
  1092. if nxt.get("level", 1) > current_level and _can_merge_backward(
  1093. nxt.get("table_chunk_role", "none")
  1094. ):
  1095. combined = _merged_pair(
  1096. cur, nxt, keep="left", tokenizer=tokenizer
  1097. )
  1098. if combined["tokens"] <= target_max:
  1099. new_result.append(combined)
  1100. i += 2
  1101. changed = True
  1102. merged = True
  1103. if not merged and _can_merge_backward(cur_role) and new_result:
  1104. prev = new_result[-1]
  1105. if (
  1106. prev.get("level", 1) < current_level
  1107. and _can_merge_forward(
  1108. prev.get("table_chunk_role", "none"), phase="B"
  1109. )
  1110. and prev["tokens"] < target_ideal
  1111. ):
  1112. combined = _merged_pair(
  1113. prev, cur, keep="left", tokenizer=tokenizer
  1114. )
  1115. if combined["tokens"] <= target_max:
  1116. new_result[-1] = combined
  1117. i += 1
  1118. changed = True
  1119. merged = True
  1120. if not merged:
  1121. new_result.append(cur)
  1122. i += 1
  1123. else:
  1124. new_result.append(cur)
  1125. i += 1
  1126. result = new_result
  1127. return result
  1128. # ---------------------------------------------------------------------------
  1129. # Public entrypoint.
  1130. # ---------------------------------------------------------------------------
  1131. def chunking_by_paragraph_semantic(
  1132. tokenizer: Tokenizer,
  1133. content: str,
  1134. chunk_token_size: int = 2000,
  1135. *,
  1136. blocks_path: str | None = None,
  1137. chunk_overlap_token_size: int = 100,
  1138. ) -> list[dict[str, Any]]:
  1139. """Paragraph Semantic Chunking — the ``chunking="P"`` strategy.
  1140. Reads structured blocks emitted by the docx native parser at
  1141. ``fixlevel=0`` (Stage A, persisted to ``blocks.jsonl``) and applies
  1142. Stage B (table re-split + glue), Stage C (anchor-driven long-block
  1143. re-split) and Stage D (bottom-up, level-aware merging). Output rows
  1144. match the schema produced by
  1145. :func:`lightrag.chunker.chunking_by_token_size`
  1146. (``tokens``/``content``/``chunk_order_index``), enriched with
  1147. ``heading``, ``parent_headings`` and ``level`` so KG extraction can
  1148. leverage the document hierarchy.
  1149. Signature follows the LightRAG chunker contract — the standard
  1150. prefix ``(tokenizer, content, chunk_token_size)`` is shared with
  1151. every other chunker, while strategy-specific knobs are keyword-only:
  1152. - ``blocks_path`` (this strategy's required input — the
  1153. ``.blocks.jsonl`` sidecar produced at parse time)
  1154. Knobs that ``chunking_by_token_size`` exposes for delimiter-based
  1155. splitting (``split_by_character``, ``split_by_character_only``) are
  1156. deliberately absent here because paragraph-semantic chunks are
  1157. heading-aligned. ``chunk_overlap_token_size`` is supported for two
  1158. paragraph-semantic cases where overlap preserves meaning inside one
  1159. JSONL content row: recursive-character fallback for long prose, and
  1160. bridge text duplicated around adjacent oversized table boundary chunks.
  1161. When one original ``blocks.jsonl`` content row is split into multiple
  1162. fragments, every fragment heading receives a row-local ``[part n]``
  1163. suffix; unsplit rows keep their original heading.
  1164. Args:
  1165. tokenizer: LightRAG tokenizer (used for all token counting; matches
  1166. the unit used by ``chunk_token_size``).
  1167. content: Merged plain-text content of the document. Used as the
  1168. fallback corpus when ``blocks_path`` is missing or unreadable
  1169. so the pipeline never silently drops a document.
  1170. chunk_token_size: Hard upper bound for each chunk in tokens. The
  1171. ideal target is set at 75 % of this value (mirroring the
  1172. audit-mode 6000/8000 ratio); see threshold ratio constants
  1173. above for the full mapping.
  1174. blocks_path: Path to the document's ``.blocks.jsonl`` sidecar
  1175. (typically ``parsed_data["blocks_path"]``). When ``None``,
  1176. unreadable, or empty, this function falls back to
  1177. :func:`chunking_by_recursive_character` on ``content``
  1178. (per ``docs/FileProcessingConfiguration-zh.md`` line 120 / 146).
  1179. That fallback hard-requires ``langchain-text-splitters``;
  1180. an :class:`ImportError` is surfaced rather than silently
  1181. degrading further.
  1182. chunk_overlap_token_size: Token overlap used only when P must
  1183. fall back to recursive-character splitting of ordinary text,
  1184. and as the per-side budget for duplicating text between two
  1185. adjacent oversized table chunks. Structural table row splits
  1186. remain row-bounded and non-overlapping.
  1187. Returns:
  1188. Ordered list of chunk dicts, each shaped:
  1189. ``{"tokens", "content", "chunk_order_index", "heading",
  1190. "parent_headings", "level"}``.
  1191. Notes:
  1192. blocks.jsonl field analysis vs. algorithm requirements:
  1193. - ``content`` (``\\n``-joined per ``_build_unsplit_block``) →
  1194. split back into per-paragraph text via ``split("\\n")``;
  1195. lossless because table/equation/drawing tags are emitted as
  1196. single-line replacements.
  1197. - ``heading`` / ``parent_headings`` / ``level`` → consumed
  1198. directly by Stage C/D for hierarchy-aware merging. If one
  1199. original content row produces multiple fragments, the current
  1200. ``heading`` receives a ``[part n]`` suffix after Stage B/C and
  1201. before Stage D. ``parent_headings`` remain unchanged.
  1202. - ``<table id="…" format="json">{rows_json}</table>`` tags →
  1203. JSON body parsed in Stage B for row-level re-split when the
  1204. tag exceeds the per-table token cap. When two split tables
  1205. have short text between them, that text may be repeated in
  1206. both table boundary chunks; longer bridge text leaves any
  1207. middle remainder as a separate text block.
  1208. - ``<equation>`` / ``<drawing>`` tags → treated as atomic
  1209. non-table paragraphs — neither splittable nor anchorable.
  1210. - Per-paragraph paraIds are NOT preserved in blocks.jsonl
  1211. (only block-level ``positions[].range`` is). Acceptable
  1212. because the chunking output schema does not require them.
  1213. - ``table_slice`` is always ``"none"`` in blocks.jsonl
  1214. (parse-time ``fixlevel=0`` keeps tables whole), so any
  1215. ``table_chunk_role`` consumed by Stage D is recomputed
  1216. on-the-fly inside Stage B.
  1217. """
  1218. target_max = max(int(chunk_token_size), 1)
  1219. target_ideal = max(int(target_max * _IDEAL_RATIO), 1)
  1220. table_max = max(int(target_max * _TABLE_MAX_RATIO), 1)
  1221. table_ideal = max(int(target_max * _TABLE_IDEAL_RATIO), 1)
  1222. table_min_last = max(int(table_max * _TABLE_MIN_LAST_RATIO), 1)
  1223. small_tail_threshold = max(int(target_max * _SMALL_TAIL_RATIO), 1)
  1224. overlap = _bounded_overlap(target_max, chunk_overlap_token_size)
  1225. rows: list[dict[str, Any]] = []
  1226. fallback_reason: str | None = None
  1227. if not blocks_path:
  1228. fallback_reason = "blocks_path is empty"
  1229. else:
  1230. try:
  1231. rows = _load_blocks_from_jsonl(blocks_path)
  1232. except OSError as exc:
  1233. fallback_reason = f"cannot read blocks.jsonl at {blocks_path}: {exc}"
  1234. else:
  1235. if not rows:
  1236. fallback_reason = (
  1237. f"blocks.jsonl at {blocks_path} contains no content rows"
  1238. )
  1239. if fallback_reason is not None:
  1240. # Defer to recursive-character chunking when the sidecar is
  1241. # absent — ensures non-docx documents and edge-case parses still
  1242. # produce chunks instead of silently dropping content. Document
  1243. # contract (FileProcessingConfiguration-zh.md L120 / L146) is
  1244. # explicit that P falls back to R; that contract requires
  1245. # langchain-text-splitters to be installed, so an ImportError
  1246. # here is intentional rather than a silent degrade to F. Lazy
  1247. # import dodges the recursive_character ↔ paragraph_semantic
  1248. # circular dependency.
  1249. logger.warning(
  1250. "[paragraph_semantic_chunking] %s; falling back to "
  1251. "recursive-character chunking with chunk_token_size=%d.",
  1252. fallback_reason,
  1253. target_max,
  1254. )
  1255. from lightrag.chunker.recursive_character import (
  1256. chunking_by_recursive_character,
  1257. )
  1258. return chunking_by_recursive_character(
  1259. tokenizer,
  1260. content,
  1261. target_max,
  1262. chunk_overlap_token_size=overlap,
  1263. )
  1264. # Build initial blocks (Stage A output, already persisted).
  1265. initial: list[dict[str, Any]] = []
  1266. for row in rows:
  1267. text = row.get("content", "") or ""
  1268. if not text.strip():
  1269. continue
  1270. paragraphs = _block_to_paragraphs(text)
  1271. if not paragraphs:
  1272. continue
  1273. row_blockid = str(row.get("blockid") or "").strip()
  1274. initial.append(
  1275. _new_block(
  1276. heading=row.get("heading", "") or "",
  1277. parent_headings=list(row.get("parent_headings") or []),
  1278. level=int(row.get("level", 1) or 1),
  1279. paragraphs=paragraphs,
  1280. table_chunk_role="none",
  1281. tokenizer=tokenizer,
  1282. blockids=[row_blockid] if row_blockid else None,
  1283. )
  1284. )
  1285. # Stage B/C are run per original blocks.jsonl content row so split
  1286. # fragments can be labelled with [part n] using a row-local counter
  1287. # before Stage D merges small neighbours.
  1288. after_c: list[dict[str, Any]] = []
  1289. for blk in initial:
  1290. block_after_b = _expand_block_with_table_splits(
  1291. blk,
  1292. tokenizer=tokenizer,
  1293. table_max=table_max,
  1294. table_ideal=table_ideal,
  1295. table_min_last=table_min_last,
  1296. target_max=target_max,
  1297. chunk_overlap_token_size=overlap,
  1298. )
  1299. block_after_c: list[dict[str, Any]] = []
  1300. for split_blk in block_after_b:
  1301. block_after_c.extend(
  1302. _split_long_block(
  1303. split_blk["paragraphs"],
  1304. split_blk["heading"],
  1305. split_blk["parent_headings"],
  1306. split_blk["level"],
  1307. split_blk.get("table_chunk_role", "none"),
  1308. tokenizer=tokenizer,
  1309. target_max=target_max,
  1310. target_ideal=target_ideal,
  1311. chunk_overlap_token_size=overlap,
  1312. blockids=split_blk.get("blockids") or blk.get("blockids"),
  1313. )
  1314. )
  1315. after_c.extend(_apply_part_suffixes(block_after_c))
  1316. # Stage D — bottom-up, level-aware small-block merging.
  1317. final = _merge_small_blocks(
  1318. after_c,
  1319. tokenizer=tokenizer,
  1320. target_max=target_max,
  1321. target_ideal=target_ideal,
  1322. small_tail_threshold=small_tail_threshold,
  1323. )
  1324. # Convert internal block dicts to the new chunk schema: nested heading
  1325. # dict + sidecar block carrying source blockid refs so the multimodal
  1326. # pipeline (and document-delete cache cleanup) can trace each chunk
  1327. # back to its blocks.jsonl row(s).
  1328. chunks: list[dict[str, Any]] = []
  1329. for idx, blk in enumerate(final):
  1330. body = blk["content"].strip()
  1331. if not body:
  1332. continue
  1333. chunk_dict: dict[str, Any] = {
  1334. "tokens": blk["tokens"],
  1335. "content": body,
  1336. "chunk_order_index": idx,
  1337. "heading": {
  1338. "level": int(blk.get("level") or 0),
  1339. "heading": str(blk.get("heading") or ""),
  1340. "parent_headings": list(blk.get("parent_headings") or []),
  1341. },
  1342. }
  1343. blockids = blk.get("blockids") or []
  1344. if blockids:
  1345. chunk_dict["sidecar"] = {
  1346. "type": "block",
  1347. "id": blockids[0],
  1348. "refs": [{"type": "block", "id": bid} for bid in blockids],
  1349. }
  1350. chunks.append(chunk_dict)
  1351. return chunks