| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
"""Unit tests for ``chunking_by_recursive_character`` (process_options=R)."""
- import pytest
- pytest.importorskip("langchain_text_splitters")
- from lightrag.chunker import chunking_by_recursive_character # noqa: E402
- from lightrag.utils import Tokenizer, TokenizerInterface # noqa: E402
class _CharTokenizer(TokenizerInterface):
    """Character-level tokenizer: every character becomes exactly one token.

    Because the token count equals ``len(text)``, the tests below can express
    token budgets directly as character counts.
    """

    def encode(self, content: str):
        # Each token is the Unicode code point of the matching character.
        return list(map(ord, content))

    def decode(self, tokens):
        # Inverse of ``encode``: turn code points back into characters.
        return "".join(map(chr, tokens))
def _tok() -> Tokenizer:
    """Return a fresh ``Tokenizer`` named "char-tokenizer" wrapping ``_CharTokenizer``."""
    backend = _CharTokenizer()
    return Tokenizer("char-tokenizer", backend)
@pytest.mark.offline
def test_empty_input_returns_empty_list():
    """An empty document must produce an empty chunk list."""
    assert chunking_by_recursive_character(_tok(), "") == []
@pytest.mark.offline
def test_short_input_single_chunk():
    """Text far below the token cap comes back as one unmodified chunk."""
    text = "Para A.\n\nPara B."
    result = chunking_by_recursive_character(_tok(), text, chunk_token_size=1000)

    assert len(result) == 1
    only = result[0]
    assert only["content"] == text
    assert only["chunk_order_index"] == 0
    # The char tokenizer emits one token per character, so the reported
    # token count must equal the text length.
    assert only["tokens"] == len(text)
@pytest.mark.offline
def test_paragraph_separator_used_first():
    """Chunk three blank-line-separated paragraphs under a 20-token cap.

    Each paragraph fits under the cap on its own, and the blank line is
    expected to be the first separator the default cascade tries. The
    assertions are intentionally loose: they check sequential ordering,
    non-blank content, and that every paragraph survives intact somewhere
    in the output.
    """
    paragraphs = ("Alpha section.", "Beta section.", "Gamma section.")
    body = "Alpha section.\n\nBeta section.\n\nGamma section."

    chunks = chunking_by_recursive_character(
        _tok(),
        body,
        chunk_token_size=20,
        chunk_overlap_token_size=0,
    )

    # Order indexes must run 0, 1, 2, ... with no gaps or repeats.
    assert all(c["chunk_order_index"] == i for i, c in enumerate(chunks))
    # No chunk may consist solely of whitespace.
    assert all(c["content"].strip() for c in chunks)

    # Re-join the chunks with the paragraph separator; every original
    # paragraph must appear in the result as a substring.
    joined = "\n\n".join(c["content"] for c in chunks)
    assert all(para in joined for para in paragraphs)
@pytest.mark.offline
def test_token_field_matches_tokenizer_encode_length():
    """Each chunk's ``tokens`` field equals the encoded length of its content.

    The input is 50 ``X`` characters, a blank line, and 50 ``Y`` characters,
    chunked with a 40-token cap and a 5-token overlap.
    """
    tok = _tok()
    chunks = chunking_by_recursive_character(
        tok,
        "X" * 50 + "\n\n" + "Y" * 50,
        chunk_token_size=40,
        chunk_overlap_token_size=5,
    )

    # Guard against a vacuous pass: with no chunks the loop below would
    # execute zero assertions and the test would succeed without checking
    # anything.
    assert chunks, "expected at least one chunk for non-empty input"

    for c in chunks:
        assert c["tokens"] == len(tok.encode(c["content"]))
@pytest.mark.offline
def test_custom_separators_are_honored():
    """Chunk a pipe-delimited string using caller-supplied ``separators``.

    The separators are ``"|"`` followed by the empty string, with a 10-token
    cap and no overlap.
    """
    chunks = chunking_by_recursive_character(
        _tok(),
        "alpha|beta|gamma|delta",
        chunk_token_size=10,
        chunk_overlap_token_size=0,
        separators=["|", ""],
    )
    texts = [c["content"] for c in chunks]

    # The first and last names must each appear whole inside some chunk.
    for name in ("alpha", "delta"):
        assert any(name in t for t in texts)

    # No chunk may exceed the 10-token cap.
    assert all(c["tokens"] <= 10 for c in chunks)
|