| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- """Regression tests for paragraph-semantic Stage D merging and the top-level R fallback."""
- import pytest
- from lightrag.chunker.paragraph_semantic import (
- _merge_small_blocks,
- chunking_by_paragraph_semantic,
- )
- from lightrag.utils import Tokenizer, TokenizerInterface
- class _CharTokenizer(TokenizerInterface):
- """1:1 character-to-token mapping — keeps math obvious in assertions."""
- def encode(self, content: str):
- return [ord(ch) for ch in content]
- def decode(self, tokens):
- return "".join(chr(t) for t in tokens)
- def _make_tokenizer() -> Tokenizer:
- return Tokenizer(model_name="char", tokenizer=_CharTokenizer())
- def _make_block(text: str, *, tokenizer: Tokenizer, level: int = 1) -> dict:
- return {
- "heading": "H",
- "parent_headings": [],
- "level": level,
- "paragraphs": [{"text": text, "is_table": False}],
- "content": text,
- "tokens": len(tokenizer.encode(text)),
- "table_chunk_role": "none",
- }
- @pytest.mark.offline
- def test_tail_absorption_rejects_when_separator_pushes_over_cap():
- # Tail absorption joins blocks with ``"\n\n"`` but the original
- # predicate only summed per-block tokens. With cur=99 and tail=1
- # the raw sum equals target_max=100, but the actual joined
- # ``"x"*99 + "\n\n" + "y"*1`` measures 102 tokens — the absorbed
- # block silently overflowed before the fix re-measured the joined
- # content.
- tokenizer = _make_tokenizer()
- blocks = [
- _make_block("x" * 99, tokenizer=tokenizer),
- _make_block("y" * 1, tokenizer=tokenizer),
- ]
- merged = _merge_small_blocks(
- blocks,
- tokenizer=tokenizer,
- target_max=100,
- target_ideal=80,
- small_tail_threshold=12,
- )
- assert all(b["tokens"] <= 100 for b in merged), [b["tokens"] for b in merged]
- @pytest.mark.offline
- def test_tail_absorption_still_fires_when_joined_size_fits():
- # Sanity check: when the joined content (including separators)
- # genuinely fits target_max, absorption still happens. cur=80 +
- # "\n\n" (2 tokens) + tail=1 = 83 ≤ 100.
- tokenizer = _make_tokenizer()
- blocks = [
- _make_block("x" * 80, tokenizer=tokenizer),
- _make_block("y" * 1, tokenizer=tokenizer),
- ]
- merged = _merge_small_blocks(
- blocks,
- tokenizer=tokenizer,
- target_max=100,
- target_ideal=80,
- small_tail_threshold=12,
- )
- assert len(merged) == 1
- assert merged[0]["tokens"] == 83
- assert merged[0]["content"] == "x" * 80 + "\n\n" + "y" * 1
- @pytest.mark.offline
- def test_paragraph_semantic_fallback_passes_configured_recursive_overlap(monkeypatch):
- # When ``blocks_path`` is missing, paragraph-semantic chunking
- # delegates to ``chunking_by_recursive_character``. P now permits
- # overlap for long text under one JSONL row, so the fallback must
- # pass through the configured overlap rather than forcing zero.
- captured: dict[str, object] = {}
- def fake_chunker(
- tokenizer,
- content,
- chunk_token_size: int = 1200,
- *,
- chunk_overlap_token_size: int = 100,
- separators=None,
- ):
- captured["chunk_overlap_token_size"] = chunk_overlap_token_size
- captured["chunk_token_size"] = chunk_token_size
- return [
- {
- "tokens": len(tokenizer.encode(content)),
- "content": content,
- "chunk_order_index": 0,
- }
- ]
- import lightrag.chunker.recursive_character as rc_mod
- monkeypatch.setattr(rc_mod, "chunking_by_recursive_character", fake_chunker)
- tokenizer = _make_tokenizer()
- chunking_by_paragraph_semantic(
- tokenizer,
- "fallback corpus",
- chunk_token_size=500,
- blocks_path=None,
- chunk_overlap_token_size=37,
- )
- assert (
- captured.get("chunk_overlap_token_size") == 37
- ), "P→R fallback must pass the configured chunk_overlap_token_size"
- assert captured.get("chunk_token_size") == 500
|