| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128 |
- """Fixed-size token-window chunking — the LightRAG default strategy.
- Chunks the input text into windows of at most ``chunk_token_size`` tokens
- with ``chunk_overlap_token_size`` of overlap between adjacent windows.
- When ``split_by_character`` is supplied, the splitter first segments on
- that delimiter and then either tokenizes each segment as-is
- (``split_by_character_only=True``) or further sub-splits any segment
- that exceeds the token cap.
- Two entry points are exported:
- - :func:`chunking_by_token_size` — the **legacy 6-arg signature**
- used as the default value for :attr:`lightrag.LightRAG.chunking_func`.
- Kept for backward compatibility so externally-supplied chunking
- functions can continue to drop in unchanged.
- - :func:`chunking_by_fixed_token` — the same algorithm exposed under
- the **new file-chunker contract** (standard prefix
- ``(tokenizer, content, chunk_token_size)`` plus keyword-only
- knobs). Used by the file-based chunking dispatcher in
- ``process_single_document`` for ``doc_process_opts.chunking == "F"``.
- """
- from __future__ import annotations
- from typing import Any
- from lightrag.exceptions import ChunkTokenLimitExceededError
- from lightrag.utils import Tokenizer, logger
def _split_into_token_windows(
    tokenizer: Tokenizer,
    tokens: list[Any],
    chunk_token_size: int,
    chunk_overlap_token_size: int,
) -> list[tuple[int, str]]:
    """Cut ``tokens`` into overlapping fixed-size windows and decode each one.

    Windows start every ``chunk_token_size - chunk_overlap_token_size``
    tokens and hold at most ``chunk_token_size`` tokens.

    Returns:
        ``(token_count, decoded_text)`` pairs in document order.

    Raises:
        ValueError: If the overlap is not strictly smaller than the window
            size. A zero stride cannot advance and a negative stride would
            make ``range`` empty, silently discarding every token.
    """
    stride = chunk_token_size - chunk_overlap_token_size
    if stride <= 0:
        raise ValueError(
            "chunk_overlap_token_size must be smaller than chunk_token_size "
            f"(got overlap={chunk_overlap_token_size}, size={chunk_token_size})"
        )
    return [
        (
            min(chunk_token_size, len(tokens) - start),
            tokenizer.decode(tokens[start : start + chunk_token_size]),
        )
        for start in range(0, len(tokens), stride)
    ]


def chunking_by_token_size(
    tokenizer: Tokenizer,
    content: str,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    chunk_overlap_token_size: int = 100,
    chunk_token_size: int = 1200,
) -> list[dict[str, Any]]:
    """Legacy 6-arg fixed-token chunker (default for ``LightRAG.chunking_func``).

    Signature is preserved for backward compatibility with externally
    supplied ``chunking_func`` implementations. New file-based chunking
    dispatch uses :func:`chunking_by_fixed_token` instead.

    Args:
        tokenizer: Object providing ``encode(str)`` and ``decode(tokens)``.
        content: Text to chunk.
        split_by_character: Optional delimiter. When truthy, ``content`` is
            first split on it and each segment is handled separately; no
            segment is filtered out, including ones that are empty.
        split_by_character_only: With a delimiter, keep every segment as one
            chunk and refuse segments longer than ``chunk_token_size``
            instead of sub-splitting them.
        chunk_overlap_token_size: Tokens shared by adjacent windows. Only
            consulted when windowing actually happens.
        chunk_token_size: Maximum number of tokens per chunk.

    Returns:
        One dict per chunk with keys ``"tokens"`` (token count of the chunk
        before whitespace stripping), ``"content"`` (the stripped chunk
        text) and ``"chunk_order_index"`` (0-based position).

    Raises:
        ChunkTokenLimitExceededError: ``split_by_character_only`` is set and
            a segment exceeds ``chunk_token_size``.
        ValueError: Windowing is required but ``chunk_overlap_token_size``
            is not smaller than ``chunk_token_size``.
    """
    sized_chunks: list[tuple[int, str]]

    if not split_by_character:
        # The whole document is tokenized only on this path; the
        # delimiter paths tokenize segment by segment instead.
        sized_chunks = _split_into_token_windows(
            tokenizer,
            tokenizer.encode(content),
            chunk_token_size,
            chunk_overlap_token_size,
        )
    elif split_by_character_only:
        sized_chunks = []
        for segment in content.split(split_by_character):
            segment_tokens = tokenizer.encode(segment)
            if len(segment_tokens) > chunk_token_size:
                logger.warning(
                    "Chunk split_by_character exceeds token limit: len=%d limit=%d",
                    len(segment_tokens),
                    chunk_token_size,
                )
                raise ChunkTokenLimitExceededError(
                    chunk_tokens=len(segment_tokens),
                    chunk_token_limit=chunk_token_size,
                    chunk_preview=segment[:120],
                )
            sized_chunks.append((len(segment_tokens), segment))
    else:
        sized_chunks = []
        for segment in content.split(split_by_character):
            segment_tokens = tokenizer.encode(segment)
            if len(segment_tokens) > chunk_token_size:
                # Oversized segment: fall back to overlapping windows.
                sized_chunks.extend(
                    _split_into_token_windows(
                        tokenizer,
                        segment_tokens,
                        chunk_token_size,
                        chunk_overlap_token_size,
                    )
                )
            else:
                # Keep the original text rather than a decode round-trip.
                sized_chunks.append((len(segment_tokens), segment))

    return [
        {
            "tokens": token_count,
            "content": text.strip(),
            "chunk_order_index": index,
        }
        for index, (token_count, text) in enumerate(sized_chunks)
    ]
def chunking_by_fixed_token(
    tokenizer: Tokenizer,
    content: str,
    chunk_token_size: int = 1200,
    *,
    chunk_overlap_token_size: int = 100,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
) -> list[dict[str, Any]]:
    """Fixed-token chunker under the file-chunker contract (``"F"`` strategy).

    Adapter around :func:`chunking_by_token_size`: it accepts the standard
    file-chunker prefix ``(tokenizer, content, chunk_token_size)`` plus
    keyword-only strategy options, and forwards everything unchanged to
    the legacy function, returning whatever that function returns.
    """
    # Gather the strategy options under the legacy function's parameter
    # names so the call below does not depend on its positional order.
    legacy_options = {
        "split_by_character": split_by_character,
        "split_by_character_only": split_by_character_only,
        "chunk_overlap_token_size": chunk_overlap_token_size,
        "chunk_token_size": chunk_token_size,
    }
    return chunking_by_token_size(tokenizer, content, **legacy_options)
|