# test_chunking.py
  1. import pytest
  2. from lightrag.exceptions import ChunkTokenLimitExceededError
  3. from lightrag.chunker import chunking_by_token_size
  4. from lightrag.utils import Tokenizer, TokenizerInterface
  5. class DummyTokenizer(TokenizerInterface):
  6. """Simple 1:1 character-to-token mapping."""
  7. def encode(self, content: str):
  8. return [ord(ch) for ch in content]
  9. def decode(self, tokens):
  10. return "".join(chr(token) for token in tokens)
  11. class MultiTokenCharacterTokenizer(TokenizerInterface):
  12. """
  13. Tokenizer where character-to-token ratio is non-uniform.
  14. This helps catch bugs where code incorrectly counts characters instead of tokens.
  15. Mapping:
  16. - Uppercase letters: 2 tokens each
  17. - Punctuation (!, ?, .): 3 tokens each
  18. - Other characters: 1 token each
  19. """
  20. def encode(self, content: str):
  21. tokens = []
  22. for ch in content:
  23. if ch.isupper(): # Uppercase = 2 tokens
  24. tokens.extend([ord(ch), ord(ch) + 1000])
  25. elif ch in ["!", "?", "."]: # Punctuation = 3 tokens
  26. tokens.extend([ord(ch), ord(ch) + 2000, ord(ch) + 3000])
  27. else: # Regular chars = 1 token
  28. tokens.append(ord(ch))
  29. return tokens
  30. def decode(self, tokens):
  31. # Simplified decode for testing
  32. result = []
  33. i = 0
  34. while i < len(tokens):
  35. base_token = tokens[i]
  36. # Check if this is part of a multi-token sequence
  37. if (
  38. i + 2 < len(tokens)
  39. and tokens[i + 1] == base_token + 2000
  40. and tokens[i + 2] == base_token + 3000
  41. ):
  42. # 3-token punctuation
  43. result.append(chr(base_token))
  44. i += 3
  45. elif i + 1 < len(tokens) and tokens[i + 1] == base_token + 1000:
  46. # 2-token uppercase
  47. result.append(chr(base_token))
  48. i += 2
  49. else:
  50. # Single token
  51. result.append(chr(base_token))
  52. i += 1
  53. return "".join(result)
  54. def make_tokenizer() -> Tokenizer:
  55. return Tokenizer(model_name="dummy", tokenizer=DummyTokenizer())
  56. def make_multi_token_tokenizer() -> Tokenizer:
  57. return Tokenizer(model_name="multi", tokenizer=MultiTokenCharacterTokenizer())
  58. # ============================================================================
  59. # Tests for split_by_character_only=True (raises error on oversized chunks)
  60. # ============================================================================
  61. @pytest.mark.offline
  62. def test_split_by_character_only_within_limit():
  63. """Test chunking when all chunks are within token limit."""
  64. tokenizer = make_tokenizer()
  65. chunks = chunking_by_token_size(
  66. tokenizer,
  67. "alpha\n\nbeta",
  68. split_by_character="\n\n",
  69. split_by_character_only=True,
  70. chunk_token_size=10,
  71. )
  72. assert [chunk["content"] for chunk in chunks] == ["alpha", "beta"]
  73. @pytest.mark.offline
  74. def test_split_by_character_only_exceeding_limit_raises():
  75. """Test that oversized chunks raise ChunkTokenLimitExceededError."""
  76. tokenizer = make_tokenizer()
  77. oversized = "a" * 12
  78. with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
  79. chunking_by_token_size(
  80. tokenizer,
  81. oversized,
  82. split_by_character="\n\n",
  83. split_by_character_only=True,
  84. chunk_token_size=5,
  85. )
  86. err = excinfo.value
  87. assert err.chunk_tokens == len(oversized)
  88. assert err.chunk_token_limit == 5
  89. @pytest.mark.offline
  90. def test_chunk_error_includes_preview():
  91. """Test that error message includes chunk preview."""
  92. tokenizer = make_tokenizer()
  93. oversized = "x" * 100
  94. with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
  95. chunking_by_token_size(
  96. tokenizer,
  97. oversized,
  98. split_by_character="\n\n",
  99. split_by_character_only=True,
  100. chunk_token_size=10,
  101. )
  102. err = excinfo.value
  103. # Preview should be first 80 chars of a 100-char string
  104. assert err.chunk_preview == "x" * 80
  105. assert "Preview:" in str(err)
  106. @pytest.mark.offline
  107. def test_split_by_character_only_at_exact_limit():
  108. """Test chunking when chunk is exactly at token limit."""
  109. tokenizer = make_tokenizer()
  110. exact_size = "a" * 10
  111. chunks = chunking_by_token_size(
  112. tokenizer,
  113. exact_size,
  114. split_by_character="\n\n",
  115. split_by_character_only=True,
  116. chunk_token_size=10,
  117. )
  118. assert len(chunks) == 1
  119. assert chunks[0]["content"] == exact_size
  120. assert chunks[0]["tokens"] == 10
  121. @pytest.mark.offline
  122. def test_split_by_character_only_one_over_limit():
  123. """Test that chunk with one token over limit raises error."""
  124. tokenizer = make_tokenizer()
  125. one_over = "a" * 11
  126. with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
  127. chunking_by_token_size(
  128. tokenizer,
  129. one_over,
  130. split_by_character="\n\n",
  131. split_by_character_only=True,
  132. chunk_token_size=10,
  133. )
  134. err = excinfo.value
  135. assert err.chunk_tokens == 11
  136. assert err.chunk_token_limit == 10
  137. # ============================================================================
  138. # Tests for split_by_character_only=False (recursive splitting)
  139. # ============================================================================
  140. @pytest.mark.offline
  141. def test_split_recursive_oversized_chunk():
  142. """Test recursive splitting of oversized chunk with split_by_character_only=False."""
  143. tokenizer = make_tokenizer()
  144. # 30 chars - should split into chunks of size 10
  145. oversized = "a" * 30
  146. chunks = chunking_by_token_size(
  147. tokenizer,
  148. oversized,
  149. split_by_character="\n\n",
  150. split_by_character_only=False,
  151. chunk_token_size=10,
  152. chunk_overlap_token_size=0,
  153. )
  154. # Should create 3 chunks of 10 tokens each
  155. assert len(chunks) == 3
  156. assert all(chunk["tokens"] == 10 for chunk in chunks)
  157. assert all(chunk["content"] == "a" * 10 for chunk in chunks)
  158. @pytest.mark.offline
  159. def test_split_with_chunk_overlap():
  160. """
  161. Test chunk splitting with overlap using distinctive content.
  162. With distinctive characters, we can verify overlap positions are exact.
  163. Misaligned overlap would produce wrong content and fail the test.
  164. """
  165. tokenizer = make_tokenizer()
  166. # Each character is unique - enables exact position verification
  167. content = "0123456789abcdefghijklmno" # 25 chars
  168. chunks = chunking_by_token_size(
  169. tokenizer,
  170. content,
  171. split_by_character="\n\n",
  172. split_by_character_only=False,
  173. chunk_token_size=10,
  174. chunk_overlap_token_size=3,
  175. )
  176. # With overlap=3, step size = chunk_size - overlap = 10 - 3 = 7
  177. # Chunks start at positions: 0, 7, 14, 21
  178. assert len(chunks) == 4
  179. # Verify exact content and token counts
  180. assert chunks[0]["tokens"] == 10
  181. assert chunks[0]["content"] == "0123456789" # [0:10]
  182. assert chunks[1]["tokens"] == 10
  183. assert chunks[1]["content"] == "789abcdefg" # [7:17] - overlaps with "789"
  184. assert chunks[2]["tokens"] == 10
  185. assert chunks[2]["content"] == "efghijklmn" # [14:24] - overlaps with "efg"
  186. assert chunks[3]["tokens"] == 4
  187. assert chunks[3]["content"] == "lmno" # [21:25] - overlaps with "lmn"
  188. @pytest.mark.offline
  189. def test_split_multiple_chunks_with_mixed_sizes():
  190. """Test splitting text with multiple chunks of different sizes."""
  191. tokenizer = make_tokenizer()
  192. # "small\n\nlarge_chunk_here\n\nmedium"
  193. # small: 5 tokens, large_chunk_here: 16 tokens, medium: 6 tokens
  194. content = "small\n\n" + "a" * 16 + "\n\nmedium"
  195. chunks = chunking_by_token_size(
  196. tokenizer,
  197. content,
  198. split_by_character="\n\n",
  199. split_by_character_only=False,
  200. chunk_token_size=10,
  201. chunk_overlap_token_size=2,
  202. )
  203. # First chunk "small" should be kept as is (5 tokens)
  204. # Second chunk (16 tokens) should be split into 2 chunks
  205. # Third chunk "medium" should be kept as is (6 tokens)
  206. assert len(chunks) == 4
  207. assert chunks[0]["content"] == "small"
  208. assert chunks[0]["tokens"] == 5
  209. @pytest.mark.offline
  210. def test_split_exact_boundary():
  211. """Test splitting at exact chunk boundaries."""
  212. tokenizer = make_tokenizer()
  213. # Exactly 20 chars, should split into 2 chunks of 10
  214. content = "a" * 20
  215. chunks = chunking_by_token_size(
  216. tokenizer,
  217. content,
  218. split_by_character="\n\n",
  219. split_by_character_only=False,
  220. chunk_token_size=10,
  221. chunk_overlap_token_size=0,
  222. )
  223. assert len(chunks) == 2
  224. assert chunks[0]["tokens"] == 10
  225. assert chunks[1]["tokens"] == 10
  226. @pytest.mark.offline
  227. def test_split_very_large_text():
  228. """Test splitting very large text into multiple chunks."""
  229. tokenizer = make_tokenizer()
  230. # 100 chars should create 10 chunks with chunk_size=10, overlap=0
  231. content = "a" * 100
  232. chunks = chunking_by_token_size(
  233. tokenizer,
  234. content,
  235. split_by_character="\n\n",
  236. split_by_character_only=False,
  237. chunk_token_size=10,
  238. chunk_overlap_token_size=0,
  239. )
  240. assert len(chunks) == 10
  241. assert all(chunk["tokens"] == 10 for chunk in chunks)
  242. # ============================================================================
  243. # Edge Cases
  244. # ============================================================================
  245. @pytest.mark.offline
  246. def test_empty_content():
  247. """Test chunking with empty content."""
  248. tokenizer = make_tokenizer()
  249. chunks = chunking_by_token_size(
  250. tokenizer,
  251. "",
  252. split_by_character="\n\n",
  253. split_by_character_only=True,
  254. chunk_token_size=10,
  255. )
  256. assert len(chunks) == 1
  257. assert chunks[0]["content"] == ""
  258. assert chunks[0]["tokens"] == 0
  259. @pytest.mark.offline
  260. def test_single_character():
  261. """Test chunking with single character."""
  262. tokenizer = make_tokenizer()
  263. chunks = chunking_by_token_size(
  264. tokenizer,
  265. "a",
  266. split_by_character="\n\n",
  267. split_by_character_only=True,
  268. chunk_token_size=10,
  269. )
  270. assert len(chunks) == 1
  271. assert chunks[0]["content"] == "a"
  272. assert chunks[0]["tokens"] == 1
  273. @pytest.mark.offline
  274. def test_no_delimiter_in_content():
  275. """Test chunking when content has no delimiter."""
  276. tokenizer = make_tokenizer()
  277. content = "a" * 30
  278. chunks = chunking_by_token_size(
  279. tokenizer,
  280. content,
  281. split_by_character="\n\n", # Delimiter not in content
  282. split_by_character_only=False,
  283. chunk_token_size=10,
  284. chunk_overlap_token_size=0,
  285. )
  286. # Should still split based on token size
  287. assert len(chunks) == 3
  288. assert all(chunk["tokens"] == 10 for chunk in chunks)
  289. @pytest.mark.offline
  290. def test_no_split_character():
  291. """Test chunking without split_by_character (None)."""
  292. tokenizer = make_tokenizer()
  293. content = "a" * 30
  294. chunks = chunking_by_token_size(
  295. tokenizer,
  296. content,
  297. split_by_character=None,
  298. split_by_character_only=False,
  299. chunk_token_size=10,
  300. chunk_overlap_token_size=0,
  301. )
  302. # Should split based purely on token size
  303. assert len(chunks) == 3
  304. assert all(chunk["tokens"] == 10 for chunk in chunks)
  305. # ============================================================================
  306. # Parameter Combinations
  307. # ============================================================================
  308. @pytest.mark.offline
  309. def test_different_delimiter_newline():
  310. """Test with single newline delimiter."""
  311. tokenizer = make_tokenizer()
  312. content = "alpha\nbeta\ngamma"
  313. chunks = chunking_by_token_size(
  314. tokenizer,
  315. content,
  316. split_by_character="\n",
  317. split_by_character_only=True,
  318. chunk_token_size=10,
  319. )
  320. assert len(chunks) == 3
  321. assert [c["content"] for c in chunks] == ["alpha", "beta", "gamma"]
  322. @pytest.mark.offline
  323. def test_delimiter_based_splitting_verification():
  324. """
  325. Verify that chunks are actually split at delimiter positions.
  326. This test ensures split_by_character truly splits at the delimiter,
  327. not at arbitrary positions.
  328. """
  329. tokenizer = make_tokenizer()
  330. # Content with clear delimiter boundaries
  331. content = "part1||part2||part3||part4"
  332. chunks = chunking_by_token_size(
  333. tokenizer,
  334. content,
  335. split_by_character="||",
  336. split_by_character_only=True,
  337. chunk_token_size=20,
  338. )
  339. # Should split exactly at || delimiters
  340. assert len(chunks) == 4
  341. assert chunks[0]["content"] == "part1"
  342. assert chunks[1]["content"] == "part2"
  343. assert chunks[2]["content"] == "part3"
  344. assert chunks[3]["content"] == "part4"
  345. # Verify delimiter is not included in chunks
  346. for chunk in chunks:
  347. assert "||" not in chunk["content"]
  348. @pytest.mark.offline
  349. def test_multi_character_delimiter_splitting():
  350. """
  351. Verify that multi-character delimiters are correctly recognized and not partially matched.
  352. Tests various multi-character delimiter scenarios to ensure the entire delimiter
  353. sequence is used for splitting, not individual characters.
  354. """
  355. tokenizer = make_tokenizer()
  356. # Test 1: Multi-character delimiter that contains single chars also present elsewhere
  357. content = "data<SEP>more<SEP>final"
  358. chunks = chunking_by_token_size(
  359. tokenizer,
  360. content,
  361. split_by_character="<SEP>",
  362. split_by_character_only=True,
  363. chunk_token_size=50,
  364. )
  365. assert len(chunks) == 3
  366. assert chunks[0]["content"] == "data"
  367. assert chunks[1]["content"] == "more"
  368. assert chunks[2]["content"] == "final"
  369. # Verify full delimiter is not in chunks, not just parts
  370. for chunk in chunks:
  371. assert "<SEP>" not in chunk["content"]
  372. # Test 2: Delimiter appears in middle of content
  373. content = "first><second><third"
  374. chunks = chunking_by_token_size(
  375. tokenizer,
  376. content,
  377. split_by_character="><", # Multi-char delimiter
  378. split_by_character_only=True,
  379. chunk_token_size=50,
  380. )
  381. # Should split at "><" delimiter
  382. assert len(chunks) == 3
  383. assert chunks[0]["content"] == "first"
  384. assert chunks[1]["content"] == "second"
  385. assert chunks[2]["content"] == "third"
  386. # Test 3: Three-character delimiter
  387. content = "section1[***]section2[***]section3"
  388. chunks = chunking_by_token_size(
  389. tokenizer,
  390. content,
  391. split_by_character="[***]",
  392. split_by_character_only=True,
  393. chunk_token_size=50,
  394. )
  395. assert len(chunks) == 3
  396. assert chunks[0]["content"] == "section1"
  397. assert chunks[1]["content"] == "section2"
  398. assert chunks[2]["content"] == "section3"
  399. # Test 4: Delimiter with special regex characters (should be treated literally)
  400. content = "partA...partB...partC"
  401. chunks = chunking_by_token_size(
  402. tokenizer,
  403. content,
  404. split_by_character="...",
  405. split_by_character_only=True,
  406. chunk_token_size=50,
  407. )
  408. assert len(chunks) == 3
  409. assert chunks[0]["content"] == "partA"
  410. assert chunks[1]["content"] == "partB"
  411. assert chunks[2]["content"] == "partC"
  412. @pytest.mark.offline
  413. def test_delimiter_partial_match_not_split():
  414. """
  415. Verify that partial matches of multi-character delimiters don't cause splits.
  416. Only the complete delimiter sequence should trigger a split.
  417. """
  418. tokenizer = make_tokenizer()
  419. # Content contains "||" delimiter but also contains single "|"
  420. content = "data|single||data|with|pipes||final"
  421. chunks = chunking_by_token_size(
  422. tokenizer,
  423. content,
  424. split_by_character="||", # Only split on double pipe
  425. split_by_character_only=True,
  426. chunk_token_size=50,
  427. )
  428. # Should split only at "||", not at single "|"
  429. assert len(chunks) == 3
  430. assert chunks[0]["content"] == "data|single"
  431. assert chunks[1]["content"] == "data|with|pipes"
  432. assert chunks[2]["content"] == "final"
  433. # Single "|" should remain in content, but not double "||"
  434. assert "|" in chunks[0]["content"]
  435. assert "|" in chunks[1]["content"]
  436. assert "||" not in chunks[0]["content"]
  437. assert "||" not in chunks[1]["content"]
  438. @pytest.mark.offline
  439. def test_no_delimiter_forces_token_based_split():
  440. """
  441. Verify that when split_by_character doesn't appear in content,
  442. chunking falls back to token-based splitting.
  443. """
  444. tokenizer = make_tokenizer()
  445. # Content without the specified delimiter
  446. content = "0123456789abcdefghijklmnop" # 26 chars, no "\n\n"
  447. chunks = chunking_by_token_size(
  448. tokenizer,
  449. content,
  450. split_by_character="\n\n", # Delimiter not in content
  451. split_by_character_only=False,
  452. chunk_token_size=10,
  453. chunk_overlap_token_size=0,
  454. )
  455. # Should fall back to token-based splitting
  456. assert len(chunks) == 3
  457. assert chunks[0]["content"] == "0123456789" # [0:10]
  458. assert chunks[1]["content"] == "abcdefghij" # [10:20]
  459. assert chunks[2]["content"] == "klmnop" # [20:26]
  460. # Verify it didn't somehow split at the delimiter that doesn't exist
  461. for chunk in chunks:
  462. assert "\n\n" not in chunk["content"]
  463. @pytest.mark.offline
  464. def test_delimiter_at_exact_chunk_boundary():
  465. """
  466. Verify correct behavior when delimiter appears exactly at chunk token limit.
  467. """
  468. tokenizer = make_tokenizer()
  469. # "segment1\n\nsegment2" where each segment is within limit
  470. content = "12345\n\nabcde"
  471. chunks = chunking_by_token_size(
  472. tokenizer,
  473. content,
  474. split_by_character="\n\n",
  475. split_by_character_only=True,
  476. chunk_token_size=10,
  477. )
  478. # Should split at delimiter, not at token count
  479. assert len(chunks) == 2
  480. assert chunks[0]["content"] == "12345"
  481. assert chunks[1]["content"] == "abcde"
  482. @pytest.mark.offline
  483. def test_different_delimiter_comma():
  484. """Test with comma delimiter."""
  485. tokenizer = make_tokenizer()
  486. content = "one,two,three"
  487. chunks = chunking_by_token_size(
  488. tokenizer,
  489. content,
  490. split_by_character=",",
  491. split_by_character_only=True,
  492. chunk_token_size=10,
  493. )
  494. assert len(chunks) == 3
  495. assert [c["content"] for c in chunks] == ["one", "two", "three"]
  496. @pytest.mark.offline
  497. def test_zero_overlap():
  498. """Test with zero overlap (no overlap)."""
  499. tokenizer = make_tokenizer()
  500. content = "a" * 20
  501. chunks = chunking_by_token_size(
  502. tokenizer,
  503. content,
  504. split_by_character=None,
  505. split_by_character_only=False,
  506. chunk_token_size=10,
  507. chunk_overlap_token_size=0,
  508. )
  509. # Should create exactly 2 chunks with no overlap
  510. assert len(chunks) == 2
  511. assert chunks[0]["tokens"] == 10
  512. assert chunks[1]["tokens"] == 10
  513. @pytest.mark.offline
  514. def test_large_overlap():
  515. """
  516. Test with overlap close to chunk size using distinctive content.
  517. Large overlap (9 out of 10) means step size is only 1, creating many overlapping chunks.
  518. Distinctive characters ensure each chunk has correct positioning.
  519. """
  520. tokenizer = make_tokenizer()
  521. # Use distinctive characters to verify exact positions
  522. content = "0123456789abcdefghijklmnopqrst" # 30 chars
  523. chunks = chunking_by_token_size(
  524. tokenizer,
  525. content,
  526. split_by_character=None,
  527. split_by_character_only=False,
  528. chunk_token_size=10,
  529. chunk_overlap_token_size=9,
  530. )
  531. # With overlap=9, step size = 10 - 9 = 1
  532. # Chunks start at: 0, 1, 2, 3, ..., 20
  533. # Total chunks = 21 (from position 0 to 20, each taking 10 tokens)
  534. # Wait, let me recalculate: range(0, 30, 1) gives positions 0-29
  535. # But each chunk is 10 tokens, so last chunk starts at position 20
  536. # Actually: positions are 0, 1, 2, ..., 20 (21 chunks) for a 30-char string
  537. # No wait: for i in range(0, 30, 1): if i + 10 <= 30, we can create a chunk
  538. # So positions: 0-20 (chunks of size 10), then 21-29 would be partial
  539. # Actually the loop is: for start in range(0, len(tokens), step):
  540. # range(0, 30, 1) = [0, 1, 2, ..., 29], so 30 chunks total
  541. assert len(chunks) == 30
  542. # Verify first few chunks have correct content with proper overlap
  543. assert chunks[0]["content"] == "0123456789" # [0:10]
  544. assert (
  545. chunks[1]["content"] == "123456789a"
  546. ) # [1:11] - overlaps 9 chars with previous
  547. assert (
  548. chunks[2]["content"] == "23456789ab"
  549. ) # [2:12] - overlaps 9 chars with previous
  550. assert chunks[3]["content"] == "3456789abc" # [3:13]
  551. # Verify last chunk
  552. assert chunks[-1]["content"] == "t" # [29:30] - last char only
  553. # ============================================================================
  554. # Chunk Order Index Tests
  555. # ============================================================================
  556. @pytest.mark.offline
  557. def test_chunk_order_index_simple():
  558. """Test that chunk_order_index is correctly assigned."""
  559. tokenizer = make_tokenizer()
  560. content = "a\n\nb\n\nc"
  561. chunks = chunking_by_token_size(
  562. tokenizer,
  563. content,
  564. split_by_character="\n\n",
  565. split_by_character_only=True,
  566. chunk_token_size=10,
  567. )
  568. assert len(chunks) == 3
  569. assert chunks[0]["chunk_order_index"] == 0
  570. assert chunks[1]["chunk_order_index"] == 1
  571. assert chunks[2]["chunk_order_index"] == 2
  572. @pytest.mark.offline
  573. def test_chunk_order_index_with_splitting():
  574. """Test chunk_order_index with recursive splitting."""
  575. tokenizer = make_tokenizer()
  576. content = "a" * 30
  577. chunks = chunking_by_token_size(
  578. tokenizer,
  579. content,
  580. split_by_character=None,
  581. split_by_character_only=False,
  582. chunk_token_size=10,
  583. chunk_overlap_token_size=0,
  584. )
  585. assert len(chunks) == 3
  586. assert chunks[0]["chunk_order_index"] == 0
  587. assert chunks[1]["chunk_order_index"] == 1
  588. assert chunks[2]["chunk_order_index"] == 2
  589. # ============================================================================
  590. # Integration Tests
  591. # ============================================================================
  592. @pytest.mark.offline
  593. def test_mixed_size_chunks_no_error():
  594. """Test that mixed size chunks work without error in recursive mode."""
  595. tokenizer = make_tokenizer()
  596. # Mix of small and large chunks
  597. content = "small\n\n" + "a" * 50 + "\n\nmedium"
  598. chunks = chunking_by_token_size(
  599. tokenizer,
  600. content,
  601. split_by_character="\n\n",
  602. split_by_character_only=False,
  603. chunk_token_size=10,
  604. chunk_overlap_token_size=2,
  605. )
  606. # Should handle all chunks without error
  607. assert len(chunks) > 0
  608. # Small chunk should remain intact
  609. assert chunks[0]["content"] == "small"
  610. # Large chunk should be split into multiple pieces
  611. assert any(chunk["content"] == "a" * 10 for chunk in chunks)
  612. # Last chunk should contain "medium"
  613. assert any("medium" in chunk["content"] for chunk in chunks)
  614. @pytest.mark.offline
  615. def test_whitespace_handling():
  616. """Test that whitespace is properly handled in chunk content."""
  617. tokenizer = make_tokenizer()
  618. content = " alpha \n\n beta "
  619. chunks = chunking_by_token_size(
  620. tokenizer,
  621. content,
  622. split_by_character="\n\n",
  623. split_by_character_only=True,
  624. chunk_token_size=20,
  625. )
  626. # Content should be stripped
  627. assert chunks[0]["content"] == "alpha"
  628. assert chunks[1]["content"] == "beta"
  629. @pytest.mark.offline
  630. def test_consecutive_delimiters():
  631. """Test handling of consecutive delimiters."""
  632. tokenizer = make_tokenizer()
  633. content = "alpha\n\n\n\nbeta"
  634. chunks = chunking_by_token_size(
  635. tokenizer,
  636. content,
  637. split_by_character="\n\n",
  638. split_by_character_only=True,
  639. chunk_token_size=20,
  640. )
  641. # Should split on delimiter and include empty chunks
  642. assert len(chunks) >= 2
  643. assert "alpha" in [c["content"] for c in chunks]
  644. assert "beta" in [c["content"] for c in chunks]
  645. # ============================================================================
  646. # Token vs Character Counting Tests (Multi-Token Characters)
  647. # ============================================================================
  648. @pytest.mark.offline
  649. def test_token_counting_not_character_counting():
  650. """
  651. Verify chunking uses token count, not character count.
  652. With MultiTokenCharacterTokenizer:
  653. - "aXa" = 3 chars but 4 tokens (a=1, X=2, a=1)
  654. This test would PASS if code incorrectly used character count (3 <= 3)
  655. but correctly FAILS because token count (4 > 3).
  656. """
  657. tokenizer = make_multi_token_tokenizer()
  658. # "aXa" = 3 characters, 4 tokens
  659. content = "aXa"
  660. with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
  661. chunking_by_token_size(
  662. tokenizer,
  663. content,
  664. split_by_character="\n\n",
  665. split_by_character_only=True,
  666. chunk_token_size=3, # 3 token limit
  667. )
  668. err = excinfo.value
  669. assert err.chunk_tokens == 4 # Should be 4 tokens, not 3 characters
  670. assert err.chunk_token_limit == 3
  671. @pytest.mark.offline
  672. def test_token_limit_with_punctuation():
  673. """
  674. Test that punctuation token expansion is handled correctly.
  675. "Hi!" = 3 chars but 6 tokens (H=2, i=1, !=3)
  676. """
  677. tokenizer = make_multi_token_tokenizer()
  678. # "Hi!" = 3 characters, 6 tokens (H=2, i=1, !=3)
  679. content = "Hi!"
  680. with pytest.raises(ChunkTokenLimitExceededError) as excinfo:
  681. chunking_by_token_size(
  682. tokenizer,
  683. content,
  684. split_by_character="\n\n",
  685. split_by_character_only=True,
  686. chunk_token_size=4,
  687. )
  688. err = excinfo.value
  689. assert err.chunk_tokens == 6
  690. assert err.chunk_token_limit == 4
  691. @pytest.mark.offline
  692. def test_multi_token_within_limit():
  693. """Test that multi-token characters work when within limit."""
  694. tokenizer = make_multi_token_tokenizer()
  695. # "Hi" = 2 chars, 3 tokens (H=2, i=1)
  696. content = "Hi"
  697. chunks = chunking_by_token_size(
  698. tokenizer,
  699. content,
  700. split_by_character="\n\n",
  701. split_by_character_only=True,
  702. chunk_token_size=5,
  703. )
  704. assert len(chunks) == 1
  705. assert chunks[0]["tokens"] == 3
  706. assert chunks[0]["content"] == "Hi"
  707. @pytest.mark.offline
  708. def test_recursive_split_with_multi_token_chars():
  709. """
  710. Test recursive splitting respects token boundaries, not character boundaries.
  711. "AAAAA" = 5 chars but 10 tokens (each A = 2 tokens)
  712. With chunk_size=6, should split at token positions, not character positions.
  713. """
  714. tokenizer = make_multi_token_tokenizer()
  715. # "AAAAA" = 5 characters, 10 tokens
  716. content = "AAAAA"
  717. chunks = chunking_by_token_size(
  718. tokenizer,
  719. content,
  720. split_by_character="\n\n",
  721. split_by_character_only=False,
  722. chunk_token_size=6,
  723. chunk_overlap_token_size=0,
  724. )
  725. # Should split into: [0:6]=3 chars, [6:10]=2 chars
  726. # Not [0:3]=6 tokens, [3:5]=4 tokens (character-based would be wrong)
  727. assert len(chunks) == 2
  728. assert chunks[0]["tokens"] == 6
  729. assert chunks[1]["tokens"] == 4
  730. @pytest.mark.offline
  731. def test_overlap_uses_token_count():
  732. """
  733. Verify overlap calculation uses token count, not character count.
  734. "aAaAa" = 5 chars, 7 tokens (a=1, A=2, a=1, A=2, a=1)
  735. """
  736. tokenizer = make_multi_token_tokenizer()
  737. # "aAaAa" = 5 characters, 7 tokens
  738. content = "aAaAa"
  739. chunks = chunking_by_token_size(
  740. tokenizer,
  741. content,
  742. split_by_character="\n\n",
  743. split_by_character_only=False,
  744. chunk_token_size=4,
  745. chunk_overlap_token_size=2,
  746. )
  747. # Chunks start at token positions: 0, 2, 4, 6
  748. # [0:4]=2 chars, [2:6]=2.5 chars, [4:7]=1.5 chars
  749. assert len(chunks) == 4
  750. assert chunks[0]["tokens"] == 4
  751. assert chunks[1]["tokens"] == 4
  752. assert chunks[2]["tokens"] == 3
  753. assert chunks[3]["tokens"] == 1
  754. @pytest.mark.offline
  755. def test_mixed_multi_token_content():
  756. """Test chunking with mixed single and multi-token characters."""
  757. tokenizer = make_multi_token_tokenizer()
  758. # "hello\n\nWORLD!" = 12 chars
  759. # hello = 5 tokens, WORLD = 10 tokens (5 chars × 2), ! = 3 tokens
  760. # Total = 18 tokens
  761. content = "hello\n\nWORLD!"
  762. chunks = chunking_by_token_size(
  763. tokenizer,
  764. content,
  765. split_by_character="\n\n",
  766. split_by_character_only=True,
  767. chunk_token_size=20,
  768. )
  769. assert len(chunks) == 2
  770. assert chunks[0]["content"] == "hello"
  771. assert chunks[0]["tokens"] == 5
  772. assert chunks[1]["content"] == "WORLD!"
  773. assert chunks[1]["tokens"] == 13 # 10 + 3
  774. @pytest.mark.offline
  775. def test_exact_token_boundary_multi_token():
  776. """Test splitting exactly at token limit with multi-token characters."""
  777. tokenizer = make_multi_token_tokenizer()
  778. # "AAA" = 3 chars, 6 tokens (each A = 2 tokens)
  779. content = "AAA"
  780. chunks = chunking_by_token_size(
  781. tokenizer,
  782. content,
  783. split_by_character="\n\n",
  784. split_by_character_only=True,
  785. chunk_token_size=6,
  786. )
  787. assert len(chunks) == 1
  788. assert chunks[0]["tokens"] == 6
  789. assert chunks[0]["content"] == "AAA"
  790. @pytest.mark.offline
  791. def test_multi_token_overlap_with_distinctive_content():
  792. """
  793. Verify overlap works correctly with multi-token characters using distinctive content.
  794. With non-uniform tokenization, overlap must be calculated in token space, not character space.
  795. Distinctive characters ensure we catch any misalignment.
  796. Content: "abcABCdef"
  797. - "abc" = 3 tokens (1+1+1)
  798. - "ABC" = 6 tokens (2+2+2)
  799. - "def" = 3 tokens (1+1+1)
  800. - Total = 12 tokens
  801. """
  802. tokenizer = make_multi_token_tokenizer()
  803. # Distinctive content with mixed single and multi-token chars
  804. content = "abcABCdef" # 9 chars, 12 tokens
  805. chunks = chunking_by_token_size(
  806. tokenizer,
  807. content,
  808. split_by_character=None,
  809. split_by_character_only=False,
  810. chunk_token_size=6,
  811. chunk_overlap_token_size=2,
  812. )
  813. # With chunk_size=6, overlap=2, step=4
  814. # Chunks start at token positions: 0, 4, 8
  815. # Chunk 0: tokens [0:6] = "abcA" (tokens: a=1, b=1, c=1, A=2, total=5... wait)
  816. # Let me recalculate:
  817. # "a"=1, "b"=1, "c"=1, "A"=2, "B"=2, "C"=2, "d"=1, "e"=1, "f"=1
  818. # Token positions: a=0, b=1, c=2, A=3-4, B=5-6, C=7-8, d=9, e=10, f=11
  819. # Chunk 0 [0:6]: covers "abc" (tokens 0-2) + partial "ABC" (tokens 3-5, which is "AB")
  820. # But we need to figure out what characters that maps to...
  821. #
  822. # Actually, let's think in terms of token slicing:
  823. # tokens = [a, b, c, A1, A2, B1, B2, C1, C2, d, e, f]
  824. # Chunk 0 [0:6]: [a, b, c, A1, A2, B1] - decode to "abcAB"
  825. # Chunk 1 [4:10]: [A2, B1, B2, C1, C2, d] - decode to "ABCd"
  826. # Chunk 2 [8:12]: [C2, d, e, f] - decode to... this is problematic
  827. #
  828. # The issue is that multi-token characters might get split across chunks.
  829. # Let me verify what the actual chunking does...
  830. assert len(chunks) == 3
  831. # Just verify token counts are correct - content may vary due to character splitting
  832. assert chunks[0]["tokens"] == 6
  833. assert chunks[1]["tokens"] == 6
  834. assert chunks[2]["tokens"] == 4
  835. @pytest.mark.offline
  836. def test_decode_preserves_content():
  837. """Verify that decode correctly reconstructs original content."""
  838. tokenizer = make_multi_token_tokenizer()
  839. test_strings = [
  840. "Hello",
  841. "WORLD",
  842. "Test!",
  843. "Mixed?Case.",
  844. "ABC123xyz",
  845. ]
  846. for original in test_strings:
  847. tokens = tokenizer.encode(original)
  848. decoded = tokenizer.decode(tokens)
  849. assert decoded == original, f"Failed to decode: {original}"