# test_chunker_semantic_vector.py
  1. """Unit tests for ``chunking_by_semantic_vector`` (process_options=V)."""
  2. import asyncio
  3. import logging
  4. import numpy as np
  5. import pytest
  6. pytest.importorskip("langchain_experimental")
  7. from lightrag.chunker import chunking_by_semantic_vector # noqa: E402
  8. from lightrag.utils import EmbeddingFunc, Tokenizer, TokenizerInterface # noqa: E402
  9. class _CharTokenizer(TokenizerInterface):
  10. """1 char ≈ 1 token."""
  11. def encode(self, content: str):
  12. return [ord(ch) for ch in content]
  13. def decode(self, tokens):
  14. return "".join(chr(t) for t in tokens)
  15. def _tok() -> Tokenizer:
  16. return Tokenizer("char-tokenizer", _CharTokenizer())
  17. def _make_deterministic_embedding(dim: int = 8) -> EmbeddingFunc:
  18. """A toy async embedding func that hashes each input text into a
  19. stable unit vector — enough to drive SemanticChunker without needing
  20. a real model."""
  21. async def _embed(texts, **kwargs):
  22. rng = np.random.default_rng(seed=0)
  23. # Use a simple hash → seeded rng to get reproducible vectors per text.
  24. rows = []
  25. for text in texts:
  26. seed = abs(hash(text)) % (2**32)
  27. rng = np.random.default_rng(seed=seed)
  28. vec = rng.normal(size=dim).astype(np.float32)
  29. vec /= np.linalg.norm(vec) or 1.0
  30. rows.append(vec)
  31. return np.vstack(rows)
  32. return EmbeddingFunc(embedding_dim=dim, max_token_size=4096, func=_embed)
  33. @pytest.mark.offline
  34. def test_v_chunker_runs_with_stub_embedding():
  35. """Async chunker should split a multi-sentence body into ≥1 chunk
  36. when given a working embedding func."""
  37. body = (
  38. "Quantum mechanics describes nature at small scales. "
  39. "It contradicts classical intuition. "
  40. "Bread is baked from flour. "
  41. "Sourdough requires a long fermentation. "
  42. )
  43. async def _run():
  44. chunks = await chunking_by_semantic_vector(
  45. _tok(),
  46. body,
  47. chunk_token_size=200,
  48. embedding_func=_make_deterministic_embedding(),
  49. )
  50. return chunks
  51. chunks = asyncio.run(_run())
  52. assert len(chunks) >= 1
  53. # Each chunk dict has the canonical schema.
  54. assert all({"tokens", "content", "chunk_order_index"} <= set(c) for c in chunks)
  55. # chunk_order_index is contiguous starting at 0.
  56. assert [c["chunk_order_index"] for c in chunks] == list(range(len(chunks)))
  57. # No empty content rows.
  58. assert all(c["content"].strip() for c in chunks)
  59. class _ListHandler(logging.Handler):
  60. def __init__(self) -> None:
  61. super().__init__()
  62. self.records: list[logging.LogRecord] = []
  63. def emit(self, record: logging.LogRecord) -> None:
  64. self.records.append(record)
  65. @pytest.mark.offline
  66. def test_v_chunker_falls_back_to_recursive_when_no_embedding():
  67. """When ``embedding_func`` is None, V must log a warning and route
  68. to chunking_by_recursive_character (R) — V's only differentiator
  69. is embeddings, so without them R is the closest neighbour."""
  70. body = "Para A.\n\nPara B for fallback test.\n\nPara C."
  71. lightrag_logger = logging.getLogger("lightrag")
  72. handler = _ListHandler()
  73. handler.setLevel(logging.WARNING)
  74. lightrag_logger.addHandler(handler)
  75. try:
  76. async def _run():
  77. return await chunking_by_semantic_vector(
  78. _tok(),
  79. body,
  80. chunk_token_size=20,
  81. embedding_func=None,
  82. )
  83. chunks = asyncio.run(_run())
  84. finally:
  85. lightrag_logger.removeHandler(handler)
  86. assert len(chunks) >= 1
  87. assert any(
  88. "embedding_func is None" in rec.getMessage()
  89. for rec in handler.records
  90. if rec.levelno == logging.WARNING
  91. )
  92. @pytest.mark.offline
  93. def test_v_chunker_empty_input_returns_empty_list():
  94. async def _run():
  95. return await chunking_by_semantic_vector(_tok(), "")
  96. assert asyncio.run(_run()) == []