wxcz_admin
/
lightrag-cn-git


			
"""Regression tests for paragraph-semantic Stage D merging and the top-level R fallback."""

import pytest

from lightrag.chunker.paragraph_semantic import (
    _merge_small_blocks,
    chunking_by_paragraph_semantic,
)
from lightrag.utils import Tokenizer, TokenizerInterface


class _CharTokenizer(TokenizerInterface):
    """Test tokenizer in which every character is exactly one token.

    Each token is the character's Unicode code point, so a string's token
    count equals its length and expected sizes can be computed by hand.
    """

    def encode(self, content: str):
        # One code point per character, in order.
        return list(map(ord, content))

    def decode(self, tokens):
        # Inverse of ``encode``: turn code points back into characters.
        return "".join(map(chr, tokens))


def _make_tokenizer() -> Tokenizer:
    """Return a ``Tokenizer`` named ``"char"`` backed by ``_CharTokenizer``."""
    char_backend = _CharTokenizer()
    return Tokenizer(model_name="char", tokenizer=char_backend)


def _make_block(text: str, *, tokenizer: Tokenizer, level: int = 1) -> dict:
    return {
        "heading": "H",
        "parent_headings": [],
        "level": level,
        "paragraphs": [{"text": text, "is_table": False}],
        "content": text,
        "tokens": len(tokenizer.encode(text)),
        "table_chunk_role": "none",
    }


@pytest.mark.offline
def test_tail_absorption_rejects_when_separator_pushes_over_cap():
    """No merged block may exceed ``target_max`` once separators are counted."""
    # Tail absorption joins blocks with ``"\n\n"`` but the original
    # predicate only summed per-block tokens. With cur=99 and tail=1
    # the raw sum equals target_max=100, but the actual joined
    # ``"x"*99 + "\n\n" + "y"*1`` measures 102 tokens — the absorbed
    # block silently overflowed before the fix re-measured the joined
    # content.
    tokenizer = _make_tokenizer()
    blocks = [
        _make_block("x" * 99, tokenizer=tokenizer),
        _make_block("y" * 1, tokenizer=tokenizer),
    ]

    merged = _merge_small_blocks(
        blocks,
        tokenizer=tokenizer,
        target_max=100,
        target_ideal=80,
        small_tail_threshold=12,
    )

    stored = [b["tokens"] for b in merged]
    assert all(count <= 100 for count in stored), stored

    # Also measure the content itself. A stored count built by summing the
    # per-block tokens (99 + 1 = 100) would satisfy the check above even if
    # the joined text really held 102 tokens, which is exactly the overflow
    # this test exists to catch.
    measured = [len(tokenizer.encode(b["content"])) for b in merged]
    assert all(count <= 100 for count in measured), measured


@pytest.mark.offline
def test_tail_absorption_still_fires_when_joined_size_fits():
    """Absorption still happens when the joined text fits ``target_max``."""
    # Sanity check for the over-cap rejection: 80 tokens of head, plus the
    # 2-token "\n\n" separator, plus a 1-token tail is 83, which is within
    # the cap of 100, so the two blocks must collapse into one.
    tok = _make_tokenizer()
    head_text = "x" * 80
    tail_text = "y" * 1

    merged = _merge_small_blocks(
        [
            _make_block(head_text, tokenizer=tok),
            _make_block(tail_text, tokenizer=tok),
        ],
        tokenizer=tok,
        target_max=100,
        target_ideal=80,
        small_tail_threshold=12,
    )

    assert len(merged) == 1
    assert merged[0]["tokens"] == 83
    assert merged[0]["content"] == head_text + "\n\n" + tail_text


@pytest.mark.offline
def test_paragraph_semantic_fallback_passes_configured_recursive_overlap(monkeypatch):
    """The fallback must forward the caller's overlap and chunk size unchanged."""
    # When ``blocks_path`` is missing, paragraph-semantic chunking
    # delegates to ``chunking_by_recursive_character``. P now permits
    # overlap for long text under one JSONL row, so the fallback must
    # pass through the configured overlap rather than forcing zero.
    import lightrag.chunker.recursive_character as rc_mod

    captured: dict[str, object] = {}

    def fake_chunker(
        tokenizer,
        content,
        chunk_token_size: int = 1200,
        *,
        chunk_overlap_token_size: int = 100,
        separators=None,
    ):
        # Record the sizing arguments the fallback actually supplied.
        captured.update(
            chunk_overlap_token_size=chunk_overlap_token_size,
            chunk_token_size=chunk_token_size,
        )
        # Return the whole input as a single chunk.
        whole_input = {
            "tokens": len(tokenizer.encode(content)),
            "content": content,
            "chunk_order_index": 0,
        }
        return [whole_input]

    monkeypatch.setattr(rc_mod, "chunking_by_recursive_character", fake_chunker)

    chunking_by_paragraph_semantic(
        _make_tokenizer(),
        "fallback corpus",
        chunk_token_size=500,
        blocks_path=None,
        chunk_overlap_token_size=37,
    )

    seen_overlap = captured.get("chunk_overlap_token_size")
    seen_chunk_size = captured.get("chunk_token_size")
    assert (
        seen_overlap == 37
    ), "P→R fallback must pass the configured chunk_overlap_token_size"
    assert seen_chunk_size == 500