token_size.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
"""Fixed-size token-window chunking — the LightRAG default strategy.

Chunks the input text into windows of at most ``chunk_token_size`` tokens
with ``chunk_overlap_token_size`` tokens of overlap between adjacent windows.
When ``split_by_character`` is supplied, the splitter first segments on
that delimiter and then either keeps each segment as-is
(``split_by_character_only=True``, raising
``ChunkTokenLimitExceededError`` if a segment exceeds the token cap) or
further sub-splits any segment that exceeds the token cap.

Two entry points are exported:

- :func:`chunking_by_token_size` — the **legacy 6-arg signature**
  used as the default value for :attr:`lightrag.LightRAG.chunking_func`.
  Kept for backward compatibility so externally-supplied chunking
  functions can continue to drop in unchanged.
- :func:`chunking_by_fixed_token` — the same algorithm exposed under
  the **new file-chunker contract** (standard prefix
  ``(tokenizer, content, chunk_token_size)`` plus keyword-only
  knobs). Used by the file-based chunking dispatcher in
  ``process_single_document`` for ``doc_process_opts.chunking == "F"``.
"""
  19. from __future__ import annotations
  20. from typing import Any
  21. from lightrag.exceptions import ChunkTokenLimitExceededError
  22. from lightrag.utils import Tokenizer, logger
  23. def chunking_by_token_size(
  24. tokenizer: Tokenizer,
  25. content: str,
  26. split_by_character: str | None = None,
  27. split_by_character_only: bool = False,
  28. chunk_overlap_token_size: int = 100,
  29. chunk_token_size: int = 1200,
  30. ) -> list[dict[str, Any]]:
  31. """Legacy 6-arg fixed-token chunker (default for ``LightRAG.chunking_func``).
  32. Signature is preserved for backward compatibility with externally
  33. supplied ``chunking_func`` implementations. New file-based chunking
  34. dispatch uses :func:`chunking_by_fixed_token` instead.
  35. """
  36. tokens = tokenizer.encode(content)
  37. results: list[dict[str, Any]] = []
  38. if split_by_character:
  39. raw_chunks = content.split(split_by_character)
  40. new_chunks = []
  41. if split_by_character_only:
  42. for chunk in raw_chunks:
  43. _tokens = tokenizer.encode(chunk)
  44. if len(_tokens) > chunk_token_size:
  45. logger.warning(
  46. "Chunk split_by_character exceeds token limit: len=%d limit=%d",
  47. len(_tokens),
  48. chunk_token_size,
  49. )
  50. raise ChunkTokenLimitExceededError(
  51. chunk_tokens=len(_tokens),
  52. chunk_token_limit=chunk_token_size,
  53. chunk_preview=chunk[:120],
  54. )
  55. new_chunks.append((len(_tokens), chunk))
  56. else:
  57. for chunk in raw_chunks:
  58. _tokens = tokenizer.encode(chunk)
  59. if len(_tokens) > chunk_token_size:
  60. for start in range(
  61. 0, len(_tokens), chunk_token_size - chunk_overlap_token_size
  62. ):
  63. chunk_content = tokenizer.decode(
  64. _tokens[start : start + chunk_token_size]
  65. )
  66. new_chunks.append(
  67. (min(chunk_token_size, len(_tokens) - start), chunk_content)
  68. )
  69. else:
  70. new_chunks.append((len(_tokens), chunk))
  71. for index, (_len, chunk) in enumerate(new_chunks):
  72. results.append(
  73. {
  74. "tokens": _len,
  75. "content": chunk.strip(),
  76. "chunk_order_index": index,
  77. }
  78. )
  79. else:
  80. for index, start in enumerate(
  81. range(0, len(tokens), chunk_token_size - chunk_overlap_token_size)
  82. ):
  83. chunk_content = tokenizer.decode(tokens[start : start + chunk_token_size])
  84. results.append(
  85. {
  86. "tokens": min(chunk_token_size, len(tokens) - start),
  87. "content": chunk_content.strip(),
  88. "chunk_order_index": index,
  89. }
  90. )
  91. return results
  92. def chunking_by_fixed_token(
  93. tokenizer: Tokenizer,
  94. content: str,
  95. chunk_token_size: int = 1200,
  96. *,
  97. chunk_overlap_token_size: int = 100,
  98. split_by_character: str | None = None,
  99. split_by_character_only: bool = False,
  100. ) -> list[dict[str, Any]]:
  101. """Fixed-token chunker — file-chunker contract for the ``"F"`` strategy.
  102. Implements the same fixed-window algorithm as
  103. :func:`chunking_by_token_size`, exposed under the standard
  104. file-chunker signature ``(tokenizer, content, chunk_token_size, *,
  105. <strategy kwargs>)`` so the file-based chunking dispatcher in
  106. ``process_single_document`` can call every strategy uniformly.
  107. """
  108. return chunking_by_token_size(
  109. tokenizer,
  110. content,
  111. split_by_character=split_by_character,
  112. split_by_character_only=split_by_character_only,
  113. chunk_overlap_token_size=chunk_overlap_token_size,
  114. chunk_token_size=chunk_token_size,
  115. )