wxcz_admin
/
lightrag-cn-git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
							"""Unit tests for ``chunking_by_semantic_vector`` (process_options=V)."""

import asyncio
import logging

import numpy as np
import pytest

pytest.importorskip("langchain_experimental")

from lightrag.chunker import chunking_by_semantic_vector  # noqa: E402
from lightrag.utils import EmbeddingFunc, Tokenizer, TokenizerInterface  # noqa: E402


class _CharTokenizer(TokenizerInterface):
    """Character-level tokenizer stub: one character maps to one token."""

    def encode(self, content: str):
        """Return the Unicode code point of every character in ``content``."""
        return list(map(ord, content))

    def decode(self, tokens):
        """Join the characters whose code points are given by ``tokens``."""
        return "".join(map(chr, tokens))


def _tok() -> Tokenizer:
    """Return a ``Tokenizer`` named "char-tokenizer" backed by ``_CharTokenizer``."""
    backend = _CharTokenizer()
    return Tokenizer("char-tokenizer", backend)


def _make_deterministic_embedding(dim: int = 8) -> EmbeddingFunc:
    """Build a toy async embedding func that needs no real model.

    Every input text is mapped to a ``float32`` vector of length ``dim``,
    normalised to unit length (an all-zero draw is left as-is). The vector
    is drawn from a NumPy generator seeded with the CRC32 checksum of the
    text, so a given text yields the same vector on every call and on
    every test run — enough to drive SemanticChunker reproducibly.

    Args:
        dim: Length of each embedding vector.

    Returns:
        An ``EmbeddingFunc`` wrapping the stub, declared with
        ``embedding_dim=dim`` and ``max_token_size=4096``.
    """
    # Function-scope import keeps this helper self-contained.
    import zlib

    async def _embed(texts, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        rows = []
        for text in texts:
            # The built-in ``hash`` of a str is salted per process
            # (PYTHONHASHSEED), so it cannot provide a seed that is stable
            # across runs; CRC32 of the UTF-8 bytes can.
            seed = zlib.crc32(text.encode("utf-8"))
            rng = np.random.default_rng(seed=seed)
            vec = rng.normal(size=dim).astype(np.float32)
            # ``or 1.0`` guards against dividing by a zero norm.
            vec /= np.linalg.norm(vec) or 1.0
            rows.append(vec)
        if not rows:
            # ``np.vstack`` rejects an empty sequence; return a well-shaped
            # empty batch instead of raising.
            return np.empty((0, dim), dtype=np.float32)
        return np.vstack(rows)

    return EmbeddingFunc(embedding_dim=dim, max_token_size=4096, func=_embed)


@pytest.mark.offline
def test_v_chunker_runs_with_stub_embedding():
    """With a working stub embedding func, a multi-sentence body must come
    back as at least one well-formed chunk."""
    text = (
        "Quantum mechanics describes nature at small scales. "
        "It contradicts classical intuition. "
        "Bread is baked from flour. "
        "Sourdough requires a long fermentation. "
    )

    async def _chunk_with_stub():
        # Call the chunker from inside the running event loop.
        return await chunking_by_semantic_vector(
            _tok(),
            text,
            chunk_token_size=200,
            embedding_func=_make_deterministic_embedding(),
        )

    produced = asyncio.run(_chunk_with_stub())

    assert len(produced) >= 1
    # Every chunk dict carries the canonical keys.
    required_keys = {"tokens", "content", "chunk_order_index"}
    assert all(required_keys.issubset(chunk) for chunk in produced)
    # Order indices count up from 0 without gaps.
    order = [chunk["chunk_order_index"] for chunk in produced]
    assert order == list(range(len(produced)))
    # No chunk is blank or whitespace-only.
    assert all(chunk["content"].strip() for chunk in produced)


class _ListHandler(logging.Handler):
    """Logging handler that collects every record it receives in a list."""

    def __init__(self) -> None:
        logging.Handler.__init__(self)
        # Records are kept in the order they were emitted.
        self.records: list[logging.LogRecord] = list()

    def emit(self, record: logging.LogRecord) -> None:
        """Store ``record`` in ``self.records`` instead of writing it out."""
        self.records += [record]


@pytest.mark.offline
def test_v_chunker_falls_back_to_recursive_when_no_embedding():
    """With ``embedding_func=None`` V is expected to log a warning and hand
    the work to chunking_by_recursive_character (R): embeddings are V's
    only differentiator, so R is the nearest substitute. This test checks
    the warning and that at least one chunk is still produced."""
    text = "Para A.\n\nPara B for fallback test.\n\nPara C."

    async def _chunk_without_embedding():
        return await chunking_by_semantic_vector(
            _tok(),
            text,
            chunk_token_size=20,
            embedding_func=None,
        )

    # Capture WARNING-and-above records sent to the "lightrag" logger.
    capture = _ListHandler()
    capture.setLevel(logging.WARNING)
    target_logger = logging.getLogger("lightrag")
    target_logger.addHandler(capture)
    try:
        produced = asyncio.run(_chunk_without_embedding())
    finally:
        # Always detach so the handler does not leak into other tests.
        target_logger.removeHandler(capture)

    assert len(produced) >= 1
    # Lazy generator: messages are formatted only until a match is found.
    warning_messages = (
        rec.getMessage()
        for rec in capture.records
        if rec.levelno == logging.WARNING
    )
    assert any("embedding_func is None" in message for message in warning_messages)


@pytest.mark.offline
def test_v_chunker_empty_input_returns_empty_list():
    """An empty body must yield an empty chunk list."""

    async def _chunk_empty():
        return await chunking_by_semantic_vector(_tok(), "")

    produced = asyncio.run(_chunk_empty())
    assert produced == []