"""Fixed-size token-window chunking — the LightRAG default strategy. Chunks the input text into windows of at most ``chunk_token_size`` tokens with ``chunk_overlap_token_size`` of overlap between adjacent windows. When ``split_by_character`` is supplied, the splitter first segments on that delimiter and then either tokenizes each segment as-is (``split_by_character_only=True``) or further sub-splits any segment that exceeds the token cap. Two entry points are exported: - :func:`chunking_by_token_size` — the **legacy 6-arg signature** used as the default value for :attr:`lightrag.LightRAG.chunking_func`. Kept for backward compatibility so externally-supplied chunking functions can continue to drop in unchanged. - :func:`chunking_by_fixed_token` — the same algorithm exposed under the **new file-chunker contract** (standard prefix ``(tokenizer, content, chunk_token_size)`` plus keyword-only knobs). Used by the file-based chunking dispatcher in ``process_single_document`` for ``doc_process_opts.chunking == "F"``. """ from __future__ import annotations from typing import Any from lightrag.exceptions import ChunkTokenLimitExceededError from lightrag.utils import Tokenizer, logger def chunking_by_token_size( tokenizer: Tokenizer, content: str, split_by_character: str | None = None, split_by_character_only: bool = False, chunk_overlap_token_size: int = 100, chunk_token_size: int = 1200, ) -> list[dict[str, Any]]: """Legacy 6-arg fixed-token chunker (default for ``LightRAG.chunking_func``). Signature is preserved for backward compatibility with externally supplied ``chunking_func`` implementations. New file-based chunking dispatch uses :func:`chunking_by_fixed_token` instead. """ tokens = tokenizer.encode(content) results: list[dict[str, Any]] = [] if split_by_character: raw_chunks = content.split(split_by_character) new_chunks = [] if split_by_character_only: for chunk in raw_chunks: _tokens = tokenizer.encode(chunk) if len(_tokens) > chunk_token_size: logger.warning( "Chunk split_by_character exceeds token limit: len=%d limit=%d", len(_tokens), chunk_token_size, ) raise ChunkTokenLimitExceededError( chunk_tokens=len(_tokens), chunk_token_limit=chunk_token_size, chunk_preview=chunk[:120], ) new_chunks.append((len(_tokens), chunk)) else: for chunk in raw_chunks: _tokens = tokenizer.encode(chunk) if len(_tokens) > chunk_token_size: for start in range( 0, len(_tokens), chunk_token_size - chunk_overlap_token_size ): chunk_content = tokenizer.decode( _tokens[start : start + chunk_token_size] ) new_chunks.append( (min(chunk_token_size, len(_tokens) - start), chunk_content) ) else: new_chunks.append((len(_tokens), chunk)) for index, (_len, chunk) in enumerate(new_chunks): results.append( { "tokens": _len, "content": chunk.strip(), "chunk_order_index": index, } ) else: for index, start in enumerate( range(0, len(tokens), chunk_token_size - chunk_overlap_token_size) ): chunk_content = tokenizer.decode(tokens[start : start + chunk_token_size]) results.append( { "tokens": min(chunk_token_size, len(tokens) - start), "content": chunk_content.strip(), "chunk_order_index": index, } ) return results def chunking_by_fixed_token( tokenizer: Tokenizer, content: str, chunk_token_size: int = 1200, *, chunk_overlap_token_size: int = 100, split_by_character: str | None = None, split_by_character_only: bool = False, ) -> list[dict[str, Any]]: """Fixed-token chunker — file-chunker contract for the ``"F"`` strategy. Implements the same fixed-window algorithm as :func:`chunking_by_token_size`, exposed under the standard file-chunker signature ``(tokenizer, content, chunk_token_size, *, )`` so the file-based chunking dispatcher in ``process_single_document`` can call every strategy uniformly. """ return chunking_by_token_size( tokenizer, content, split_by_character=split_by_character, split_by_character_only=split_by_character_only, chunk_overlap_token_size=chunk_overlap_token_size, chunk_token_size=chunk_token_size, )