test_chunker_recursive_character.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. """Unit tests for ``chunking_by_recursive_character`` (process_options=R)."""
  2. import pytest
  3. pytest.importorskip("langchain_text_splitters")
  4. from lightrag.chunker import chunking_by_recursive_character # noqa: E402
  5. from lightrag.utils import Tokenizer, TokenizerInterface # noqa: E402
  6. class _CharTokenizer(TokenizerInterface):
  7. """1 char ≈ 1 token; lets assertions reason in terms of input length."""
  8. def encode(self, content: str):
  9. return [ord(ch) for ch in content]
  10. def decode(self, tokens):
  11. return "".join(chr(t) for t in tokens)
  12. def _tok() -> Tokenizer:
  13. return Tokenizer("char-tokenizer", _CharTokenizer())
  14. @pytest.mark.offline
  15. def test_empty_input_returns_empty_list():
  16. chunks = chunking_by_recursive_character(_tok(), "")
  17. assert chunks == []
  18. @pytest.mark.offline
  19. def test_short_input_single_chunk():
  20. body = "Para A.\n\nPara B."
  21. chunks = chunking_by_recursive_character(_tok(), body, chunk_token_size=1000)
  22. assert len(chunks) == 1
  23. assert chunks[0]["content"] == body
  24. assert chunks[0]["chunk_order_index"] == 0
  25. assert chunks[0]["tokens"] == len(body)
  26. @pytest.mark.offline
  27. def test_paragraph_separator_used_first():
  28. """``\\n\\n`` is the first separator in the default cascade — three
  29. paragraphs that each fit under the cap should split exactly there."""
  30. body = "Alpha section.\n\nBeta section.\n\nGamma section."
  31. chunks = chunking_by_recursive_character(
  32. _tok(),
  33. body,
  34. chunk_token_size=20,
  35. chunk_overlap_token_size=0,
  36. )
  37. assert [c["chunk_order_index"] for c in chunks] == list(range(len(chunks)))
  38. assert all(c["content"].strip() for c in chunks)
  39. # Reconstructed (joined with the splitter's separator semantics) must
  40. # at least contain each original paragraph as a substring.
  41. joined = "\n\n".join(c["content"] for c in chunks)
  42. for para in ("Alpha section.", "Beta section.", "Gamma section."):
  43. assert para in joined
  44. @pytest.mark.offline
  45. def test_token_field_matches_tokenizer_encode_length():
  46. chunks = chunking_by_recursive_character(
  47. _tok(),
  48. "X" * 50 + "\n\n" + "Y" * 50,
  49. chunk_token_size=40,
  50. chunk_overlap_token_size=5,
  51. )
  52. tok = _tok()
  53. for c in chunks:
  54. assert c["tokens"] == len(tok.encode(c["content"]))
  55. @pytest.mark.offline
  56. def test_custom_separators_are_honored():
  57. body = "alpha|beta|gamma|delta"
  58. chunks = chunking_by_recursive_character(
  59. _tok(),
  60. body,
  61. chunk_token_size=10,
  62. chunk_overlap_token_size=0,
  63. separators=["|", ""],
  64. )
  65. contents = [c["content"] for c in chunks]
  66. # With "|" as the primary separator and a 10-token cap, each 5-char
  67. # token name must land in its own chunk.
  68. assert any("alpha" in c for c in contents)
  69. assert any("delta" in c for c in contents)
  70. # Every chunk fits the cap.
  71. for c in chunks:
  72. assert c["tokens"] <= 10