test_paragraph_semantic_split_long_block.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. """Regression tests for paragraph-semantic Stage C anchor selection."""
  2. import json
  3. import pytest
  4. from lightrag.chunker.paragraph_semantic import (
  5. _split_long_block,
  6. chunking_by_paragraph_semantic,
  7. )
  8. from lightrag.utils import Tokenizer, TokenizerInterface
  9. class _CharTokenizer(TokenizerInterface):
  10. """1:1 character-to-token mapping — keeps math obvious in assertions."""
  11. def encode(self, content: str):
  12. return [ord(ch) for ch in content]
  13. def decode(self, tokens):
  14. return "".join(chr(t) for t in tokens)
  15. def _make_tokenizer() -> Tokenizer:
  16. return Tokenizer(model_name="char", tokenizer=_CharTokenizer())
  17. def _write_blocks_jsonl(tmp_path, rows: list[dict]) -> str:
  18. path = tmp_path / "doc.blocks.jsonl"
  19. path.write_text(
  20. "\n".join(json.dumps(row, ensure_ascii=False) for row in rows),
  21. encoding="utf-8",
  22. )
  23. return str(path)
  24. @pytest.mark.offline
  25. def test_split_long_block_short_lead_then_huge_does_not_recurse():
  26. # Reproduces the case where the only ≤100-char paragraph is at index 0:
  27. # before the fix, the anchor at idx=0 was selected, slice_paras was
  28. # empty, the tail was the original input, and the recursive guard
  29. # re-entered _split_long_block with the same arguments forever.
  30. tokenizer = _make_tokenizer()
  31. paragraphs = [
  32. {"text": "Short lead anchor."}, # idx 0 — short, but unusable as a divider
  33. {"text": "x" * 4000}, # idx 1 — huge, no anchor inside
  34. ]
  35. blocks = _split_long_block(
  36. paragraphs,
  37. heading="Heading",
  38. parent_headings=[],
  39. level=2,
  40. table_chunk_role="none",
  41. tokenizer=tokenizer,
  42. target_max=1000,
  43. target_ideal=750,
  44. )
  45. # Falls through to the "no eligible anchor" branch and now defers to
  46. # recursive-character splitting so ``target_max`` is honored without
  47. # relying on the embedding-time hard fallback (which uses a different
  48. # threshold). The original recursion-guard contract still holds: the
  49. # function returns a finite list rather than recursing forever.
  50. assert len(blocks) > 1
  51. assert all(b["tokens"] <= 1000 for b in blocks)
  52. # Heading hierarchy is preserved on every R-derived sub-block.
  53. assert all(b["heading"] == "Heading" for b in blocks)
  54. @pytest.mark.offline
  55. def test_split_long_block_no_anchor_pack_accounts_for_separator():
  56. # The no-anchor greedy pack joins pieces with ``"\n"``, which costs
  57. # tokens on its own. Without debiting that separator from the buffer
  58. # budget, two pieces summing to exactly target_max produced a final
  59. # chunk of ``target_max + 1`` tokens — silently violating the cap.
  60. tokenizer = _make_tokenizer()
  61. # Two paragraphs both > _MAX_ANCHOR_CANDIDATE_LENGTH (100 chars), so
  62. # neither qualifies as an anchor and the no-anchor branch fires.
  63. # Their lengths sum exactly to ``target_max`` (101 + 101 = 202),
  64. # so before the fix the joined output overflowed by the "\n" token.
  65. paragraphs = [
  66. {"text": "a" * 101},
  67. {"text": "b" * 101},
  68. ]
  69. blocks = _split_long_block(
  70. paragraphs,
  71. heading="Heading",
  72. parent_headings=[],
  73. level=2,
  74. table_chunk_role="none",
  75. tokenizer=tokenizer,
  76. target_max=202,
  77. target_ideal=150,
  78. )
  79. assert blocks, "expected at least one sub-block"
  80. assert all(b["tokens"] <= 202 for b in blocks), [b["tokens"] for b in blocks]
  81. @pytest.mark.offline
  82. def test_split_long_block_single_paragraph_oversized_is_character_split():
  83. # A single oversized paragraph used to trigger the early-return at
  84. # ``len(paragraphs) <= 1`` and the recursive-guard's ``> 1`` clause,
  85. # so the function emitted one ~total-token block that silently
  86. # blew past target_max. With both gates relaxed, the no-anchor
  87. # branch's character fallback honors the cap on this case too.
  88. tokenizer = _make_tokenizer()
  89. paragraphs = [{"text": "x" * 4000}]
  90. blocks = _split_long_block(
  91. paragraphs,
  92. heading="Heading",
  93. parent_headings=[],
  94. level=2,
  95. table_chunk_role="none",
  96. tokenizer=tokenizer,
  97. target_max=1000,
  98. target_ideal=750,
  99. )
  100. assert len(blocks) > 1, "single oversized paragraph must be split, not kept whole"
  101. assert all(b["tokens"] <= 1000 for b in blocks), [b["tokens"] for b in blocks]
  102. # Heading hierarchy is preserved on every R-derived sub-block.
  103. assert all(b["heading"] == "Heading" for b in blocks)
  104. @pytest.mark.offline
  105. def test_split_long_block_character_fallback_keeps_configured_overlap(monkeypatch):
  106. tokenizer = _make_tokenizer()
  107. captured: dict[str, int] = {}
  108. def fake_chunker(
  109. tokenizer,
  110. content,
  111. chunk_token_size: int = 1200,
  112. *,
  113. chunk_overlap_token_size: int = 100,
  114. separators=None,
  115. ):
  116. captured["chunk_overlap_token_size"] = chunk_overlap_token_size
  117. step = max(chunk_token_size - chunk_overlap_token_size, 1)
  118. tokens = tokenizer.encode(content)
  119. chunks = []
  120. for start in range(0, len(tokens), step):
  121. piece = tokenizer.decode(tokens[start : start + chunk_token_size])
  122. chunks.append(
  123. {
  124. "tokens": len(tokenizer.encode(piece)),
  125. "content": piece,
  126. "chunk_order_index": len(chunks),
  127. }
  128. )
  129. return chunks
  130. import lightrag.chunker.recursive_character as rc_mod
  131. monkeypatch.setattr(rc_mod, "chunking_by_recursive_character", fake_chunker)
  132. blocks = _split_long_block(
  133. [{"text": "x" * 260}],
  134. heading="Heading",
  135. parent_headings=[],
  136. level=2,
  137. table_chunk_role="none",
  138. tokenizer=tokenizer,
  139. target_max=100,
  140. target_ideal=75,
  141. chunk_overlap_token_size=25,
  142. )
  143. assert captured["chunk_overlap_token_size"] == 25
  144. assert len(blocks) > 1
  145. assert blocks[0]["content"][-25:] == blocks[1]["content"][:25]
  146. @pytest.mark.offline
  147. def test_split_long_block_uses_later_short_anchor():
  148. # Sanity check: a short paragraph at idx>0 IS still a valid divider.
  149. tokenizer = _make_tokenizer()
  150. paragraphs = [
  151. {"text": "x" * 1500}, # idx 0 — huge
  152. {"text": "Mid anchor."}, # idx 1 — short, eligible
  153. {"text": "y" * 1500}, # idx 2 — huge
  154. ]
  155. blocks = _split_long_block(
  156. paragraphs,
  157. heading="Heading",
  158. parent_headings=[],
  159. level=2,
  160. table_chunk_role="none",
  161. tokenizer=tokenizer,
  162. target_max=1000,
  163. target_ideal=750,
  164. )
  165. assert len(blocks) >= 2
  166. # Anchor paragraph becomes the heading of the post-split sub-block.
  167. assert any(b["heading"] == "Mid anchor." for b in blocks)
  168. @pytest.mark.offline
  169. def test_public_chunking_keeps_unsplit_heading_without_part_suffix(tmp_path):
  170. tokenizer = _make_tokenizer()
  171. blocks_path = _write_blocks_jsonl(
  172. tmp_path,
  173. [
  174. {
  175. "type": "content",
  176. "heading": "Heading",
  177. "parent_headings": [],
  178. "level": 2,
  179. "content": "short body",
  180. }
  181. ],
  182. )
  183. chunks = chunking_by_paragraph_semantic(
  184. tokenizer,
  185. "short body",
  186. chunk_token_size=100,
  187. blocks_path=blocks_path,
  188. )
  189. assert len(chunks) == 1
  190. assert chunks[0]["heading"]["heading"] == "Heading"
  191. @pytest.mark.offline
  192. def test_public_chunking_adds_part_suffixes_for_anchor_split(tmp_path):
  193. tokenizer = _make_tokenizer()
  194. body = "\n".join(["x" * 800, "Mid anchor.", "y" * 800])
  195. blocks_path = _write_blocks_jsonl(
  196. tmp_path,
  197. [
  198. {
  199. "type": "content",
  200. "heading": "Heading",
  201. "parent_headings": [],
  202. "level": 2,
  203. "content": body,
  204. }
  205. ],
  206. )
  207. chunks = chunking_by_paragraph_semantic(
  208. tokenizer,
  209. body,
  210. chunk_token_size=1000,
  211. blocks_path=blocks_path,
  212. chunk_overlap_token_size=0,
  213. )
  214. assert [chunk["heading"]["heading"] for chunk in chunks] == [
  215. "Heading [part 1]",
  216. "Mid anchor. [part 2]",
  217. ]
  218. assert all(
  219. all("[part " not in parent for parent in chunk["heading"]["parent_headings"])
  220. for chunk in chunks
  221. )
  222. @pytest.mark.offline
  223. def test_public_chunking_adds_part_suffixes_for_long_text_fallback(
  224. tmp_path, monkeypatch
  225. ):
  226. tokenizer = _make_tokenizer()
  227. def fake_chunker(
  228. tokenizer,
  229. content,
  230. chunk_token_size: int = 1200,
  231. *,
  232. chunk_overlap_token_size: int = 100,
  233. separators=None,
  234. ):
  235. tokens = tokenizer.encode(content)
  236. chunks = []
  237. for start in range(0, len(tokens), chunk_token_size):
  238. piece = tokenizer.decode(tokens[start : start + chunk_token_size])
  239. chunks.append(
  240. {
  241. "tokens": len(tokenizer.encode(piece)),
  242. "content": piece,
  243. "chunk_order_index": len(chunks),
  244. }
  245. )
  246. return chunks
  247. import lightrag.chunker.recursive_character as rc_mod
  248. monkeypatch.setattr(rc_mod, "chunking_by_recursive_character", fake_chunker)
  249. body = "z" * 260
  250. blocks_path = _write_blocks_jsonl(
  251. tmp_path,
  252. [
  253. {
  254. "type": "content",
  255. "heading": "Heading",
  256. "parent_headings": [],
  257. "level": 2,
  258. "content": body,
  259. }
  260. ],
  261. )
  262. chunks = chunking_by_paragraph_semantic(
  263. tokenizer,
  264. body,
  265. chunk_token_size=100,
  266. blocks_path=blocks_path,
  267. chunk_overlap_token_size=0,
  268. )
  269. assert [chunk["heading"]["heading"] for chunk in chunks] == [
  270. "Heading [part 1]",
  271. "Heading [part 2]",
  272. "Heading [part 3]",
  273. ]