recursive_character.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. """Recursive character chunking — the ``"R"`` strategy.
  2. Wraps LangChain's :class:`RecursiveCharacterTextSplitter` and delivers
  3. output rows in the LightRAG file-chunker schema. The splitter walks the
  4. ``separators`` list from longest semantic boundary (``\\n\\n`` by default)
  5. to weakest (the empty string), recursively re-splitting any segment that
  6. still exceeds the token cap.
  7. Token accounting goes through the LightRAG :class:`Tokenizer` via the
  8. ``length_function`` plug-in — without that, ``chunk_size`` would be
  9. measured in characters and ``chunk_token_size`` would lose its meaning.
  10. Output cap is *not* enforced internally: oversized segments are produced
  11. when no separator can break them, and
  12. :func:`lightrag.utils.enforce_chunk_token_limit_before_embedding` does the
  13. final hard split before embedding.
  14. """
  15. from __future__ import annotations
  16. from typing import Any
  17. from lightrag.utils import Tokenizer, logger
  18. try:
  19. from langchain_text_splitters import RecursiveCharacterTextSplitter
  20. _LANGCHAIN_TEXT_SPLITTERS_AVAILABLE = True
  21. except ImportError:
  22. _LANGCHAIN_TEXT_SPLITTERS_AVAILABLE = False
  23. RecursiveCharacterTextSplitter = None # type: ignore[assignment]
  24. def chunking_by_recursive_character(
  25. tokenizer: Tokenizer,
  26. content: str,
  27. chunk_token_size: int = 1200,
  28. *,
  29. chunk_overlap_token_size: int = 100,
  30. separators: list[str] | None = None,
  31. ) -> list[dict[str, Any]]:
  32. """Recursive character splitter — the ``"R"`` chunking strategy.
  33. Args:
  34. tokenizer: LightRAG tokenizer; used as the length function so
  35. ``chunk_token_size`` and ``chunk_overlap_token_size`` are
  36. interpreted in tokens, not characters.
  37. content: Text to split.
  38. chunk_token_size: Hard target size for each chunk (tokens).
  39. chunk_overlap_token_size: Token overlap between adjacent chunks.
  40. separators: Cascade of split candidates. ``None`` defers to
  41. LangChain's defaults: ``["\\n\\n", "\\n", " ", ""]``.
  42. Returns:
  43. Ordered list of ``{"tokens", "content", "chunk_order_index"}``
  44. dicts.
  45. """
  46. if not _LANGCHAIN_TEXT_SPLITTERS_AVAILABLE:
  47. raise ImportError(
  48. "langchain-text-splitters is required for the 'R' chunking "
  49. "strategy; install with `pip install langchain-text-splitters>=0.3`."
  50. )
  51. if not content or not content.strip():
  52. return []
  53. splitter_kwargs: dict[str, Any] = {
  54. "chunk_size": max(int(chunk_token_size), 1),
  55. "chunk_overlap": max(int(chunk_overlap_token_size), 0),
  56. "length_function": lambda s: len(tokenizer.encode(s)),
  57. }
  58. if separators is not None:
  59. splitter_kwargs["separators"] = list(separators)
  60. splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
  61. pieces = splitter.split_text(content)
  62. results: list[dict[str, Any]] = []
  63. for piece in pieces:
  64. body = piece.strip()
  65. if not body:
  66. continue
  67. results.append(
  68. {
  69. "tokens": len(tokenizer.encode(body)),
  70. "content": body,
  71. "chunk_order_index": len(results),
  72. }
  73. )
  74. if not results:
  75. # Defensive: splitter returned only whitespace fragments. Fall
  76. # through with a single chunk of stripped content so downstream
  77. # callers always receive at least one row when input is non-empty.
  78. logger.warning(
  79. "[recursive_character] splitter produced no non-empty chunks "
  80. "for %d-char input; emitting single fallback chunk.",
  81. len(content),
  82. )
  83. body = content.strip()
  84. if body:
  85. results.append(
  86. {
  87. "tokens": len(tokenizer.encode(body)),
  88. "content": body,
  89. "chunk_order_index": 0,
  90. }
  91. )
  92. return results