"""Tests for the `/documents/text(s)` ``chunking`` request object. Three concerns: 1. **Synchronous validation**: malformed ``chunking`` is rejected at request-parse time (HTTP 422 / ``ValidationError``) — never deferred to the background indexing task, where the HTTP response is already sent. The per-strategy typed params models do full type + value checking, not just unknown-key detection. 2. **``_resolve_text_chunking``**: a validated ``chunking`` config is frozen into ``(process_options, chunk_options)``; ``chunk_token_size`` and the strategy params land in the selected strategy's sub-dict, overriding any env-derived value, while the other strategy sub-dicts are dropped (slim). 3. **Route forwarding**: ``/documents/text`` and ``/documents/texts`` forward ``request.chunking`` to ``pipeline_index_texts`` and return 422 (without scheduling any background work) for a malformed body. """ import importlib import sys from types import SimpleNamespace import pytest from fastapi import FastAPI from fastapi.testclient import TestClient from pydantic import ValidationError _original_argv = sys.argv[:] sys.argv = [sys.argv[0]] _dr = importlib.import_module("lightrag.api.routers.document_routes") sys.argv = _original_argv TextChunkingConfig = _dr.TextChunkingConfig InsertTextRequest = _dr.InsertTextRequest _resolve_text_chunking = _dr._resolve_text_chunking create_document_routes = _dr.create_document_routes from lightrag.constants import ( # noqa: E402 PROCESS_OPTION_CHUNK_FIXED, PROCESS_OPTION_CHUNK_PARAGRAH, PROCESS_OPTION_CHUNK_RECURSIVE, PROCESS_OPTION_CHUNK_VECTOR, ) from lightrag.parser.routing import default_chunker_config # noqa: E402 pytestmark = pytest.mark.offline _ALL_STRATEGY_KEYS = { "fixed_token", "recursive_character", "semantic_vector", "paragraph_semantic", } # --------------------------------------------------------------------------- # 1. Synchronous validation # --------------------------------------------------------------------------- @pytest.mark.parametrize( "body", [ # wrong types (strict rejects lax coercion) {"strategy": "fixed_token", "params": {"chunk_token_size": True}}, {"strategy": "fixed_token", "params": {"chunk_token_size": "5"}}, {"strategy": "fixed_token", "params": {"chunk_token_size": 1.5}}, {"strategy": "fixed_token", "params": {"chunk_overlap_token_size": "bad"}}, {"strategy": "fixed_token", "params": {"split_by_character": 123}}, {"strategy": "fixed_token", "params": {"split_by_character_only": 1}}, {"strategy": "recursive_character", "params": {"separators": "abc"}}, {"strategy": "recursive_character", "params": {"separators": [1, 2]}}, # value / range {"strategy": "fixed_token", "params": {"chunk_token_size": 0}}, {"strategy": "recursive_character", "params": {"chunk_overlap_token_size": -1}}, {"strategy": "semantic_vector", "params": {"buffer_size": 0}}, {"strategy": "semantic_vector", "params": {"buffer_size": True}}, { "strategy": "semantic_vector", "params": {"breakpoint_threshold_type": "p99"}, }, { "strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 0}, }, { # strict float rejects strings (no lax numeric-string coercion) "strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": "95"}, }, { # strict float rejects bool (bool is an int subclass, undesirable here) "strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": True}, }, { # > 100 with an explicit percentile/gradient type is rejected at # parse time (both fields present, no inheritance ambiguity). "strategy": "semantic_vector", "params": { "breakpoint_threshold_type": "percentile", "breakpoint_threshold_amount": 150, }, }, { # malformed regex must be compiled/rejected at parse time "strategy": "semantic_vector", "params": {"sentence_split_regex": "("}, }, # cross-field { "strategy": "fixed_token", "params": {"chunk_token_size": 100, "chunk_overlap_token_size": 200}, }, # unknown / wrong-for-strategy keys {"strategy": "fixed_token", "params": {"bogus": 1}}, {"strategy": "fixed_token", "params": {"separators": ["x"]}}, {"strategy": "recursive_character", "params": {"buffer_size": 1}}, ], ) def test_chunking_config_rejects_malformed(body): with pytest.raises(ValidationError): TextChunkingConfig.model_validate(body) def test_chunking_config_defaults_to_fixed_token(): cfg = TextChunkingConfig.model_validate({"params": {"chunk_token_size": 500}}) assert cfg.strategy == "fixed_token" assert cfg.params == {"chunk_token_size": 500} def test_chunking_config_normalizes_to_supplied_keys_only(): # int amount is coerced to float; only the supplied key survives. cfg = TextChunkingConfig.model_validate( {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 95}} ) assert cfg.params == {"breakpoint_threshold_amount": 95.0} def test_chunking_config_amount_in_range_for_std_deviation(): # standard_deviation only requires > 0 (no [0, 100] ceiling). cfg = TextChunkingConfig.model_validate( { "strategy": "semantic_vector", "params": { "breakpoint_threshold_type": "standard_deviation", "breakpoint_threshold_amount": 3.5, }, } ) assert cfg.params["breakpoint_threshold_amount"] == 3.5 def test_chunking_config_amount_over_100_without_type_is_deferred(): # Type omitted -> the (0, 100] ceiling cannot be decided at parse time # (the effective type may be inherited), so the model must NOT assume # percentile and reject. _resolve_text_chunking applies the ceiling later. cfg = TextChunkingConfig.model_validate( {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}} ) assert cfg.params == {"breakpoint_threshold_amount": 150.0} def test_chunking_config_accepts_int_amount_widened_to_float(): # Strict float accepts an int (JSON 95) and widens it to 95.0 — the common # documented threshold magnitude. (str/bool are rejected; see the # rejection matrix above.) Exercised via both python and JSON validation # modes so the FastAPI request path (which parses JSON) stays covered. cfg = TextChunkingConfig.model_validate( {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 95}} ) assert cfg.params == {"breakpoint_threshold_amount": 95.0} assert isinstance(cfg.params["breakpoint_threshold_amount"], float) cfg_json = TextChunkingConfig.model_validate_json( '{"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 95}}' ) assert cfg_json.params == {"breakpoint_threshold_amount": 95.0} def test_chunking_config_accepts_valid_sentence_split_regex(): cfg = TextChunkingConfig.model_validate( { "strategy": "semantic_vector", "params": {"sentence_split_regex": r"(?<=[.?!])\s+"}, } ) assert cfg.params == {"sentence_split_regex": r"(?<=[.?!])\s+"} def test_chunking_config_drops_explicit_null(): # Explicit null means "inherit the default" (every param field is # Optional/None=inherit), so it must be dropped — not merged over the # resolved default, which would later make the chunker do int(None). cfg = TextChunkingConfig.model_validate( {"strategy": "fixed_token", "params": {"chunk_token_size": None}} ) assert cfg.params == {} def test_chunking_config_keeps_real_value_drops_sibling_null(): cfg = TextChunkingConfig.model_validate( { "strategy": "fixed_token", "params": {"chunk_token_size": 500, "split_by_character": None}, } ) assert cfg.params == {"chunk_token_size": 500} def test_insert_text_request_rejects_malformed_chunking(): with pytest.raises(ValidationError): InsertTextRequest.model_validate( { "text": "hi", "file_source": "a.md", "chunking": { "strategy": "recursive_character", "params": {"separators": "notalist"}, }, } ) # --------------------------------------------------------------------------- # 2. _resolve_text_chunking # --------------------------------------------------------------------------- def _stub_rag(addon_params=None): return SimpleNamespace( addon_params=addon_params if addon_params is not None else {} ) def test_resolve_none_keeps_default_fixed(): process_options, chunk_options = _resolve_text_chunking(None, _stub_rag()) assert process_options == PROCESS_OPTION_CHUNK_FIXED assert "fixed_token" in chunk_options @pytest.mark.parametrize( "strategy,expected_po,key", [ ("fixed_token", PROCESS_OPTION_CHUNK_FIXED, "fixed_token"), ("recursive_character", PROCESS_OPTION_CHUNK_RECURSIVE, "recursive_character"), ("semantic_vector", PROCESS_OPTION_CHUNK_VECTOR, "semantic_vector"), ("paragraph_semantic", PROCESS_OPTION_CHUNK_PARAGRAH, "paragraph_semantic"), ], ) def test_resolve_maps_strategy_and_writes_size_into_subdict(strategy, expected_po, key): cfg = TextChunkingConfig.model_validate( {"strategy": strategy, "params": {"chunk_token_size": 777}} ) process_options, chunk_options = _resolve_text_chunking(cfg, _stub_rag()) assert process_options == expected_po # chunk_token_size lands in the strategy sub-dict for ALL strategies # (F included, post-cleanup) — that's where process_single_document reads it. assert chunk_options[key]["chunk_token_size"] == 777 # slim contract: other strategies' sub-dicts are dropped for other in _ALL_STRATEGY_KEYS - {key}: assert other not in chunk_options def test_resolve_merges_strategy_params(): cfg = TextChunkingConfig.model_validate( { "strategy": "recursive_character", "params": {"separators": ["A", "B"], "chunk_overlap_token_size": 0}, } ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag()) assert chunk_options["recursive_character"]["separators"] == ["A", "B"] assert chunk_options["recursive_character"]["chunk_overlap_token_size"] == 0 def test_resolve_size_overrides_env_for_recursive(monkeypatch): monkeypatch.setenv("CHUNK_R_SIZE", "999") addon = {"chunker": default_chunker_config()} # sanity: env baked into the R sub-dict assert addon["chunker"]["recursive_character"]["chunk_token_size"] == 999 cfg = TextChunkingConfig.model_validate( {"strategy": "recursive_character", "params": {"chunk_token_size": 1234}} ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon)) # API value wins over the env-derived sub-dict value. assert chunk_options["recursive_character"]["chunk_token_size"] == 1234 def test_resolve_split_by_character_only_false_overrides_env(monkeypatch): # The API path can express an explicit False (a plain dict merge), unlike # the ainsert positional-arg path. Prove it overrides an env-True default. monkeypatch.setenv("CHUNK_F_SPLIT_BY_CHARACTER_ONLY", "true") addon = {"chunker": default_chunker_config()} assert addon["chunker"]["fixed_token"]["split_by_character_only"] is True cfg = TextChunkingConfig.model_validate( {"strategy": "fixed_token", "params": {"split_by_character_only": False}} ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon)) assert chunk_options["fixed_token"]["split_by_character_only"] is False def test_resolve_rejects_size_below_inherited_overlap(monkeypatch): # Overlap is inherited from addon_params (not in the request), so the # request model can't catch it — _resolve_text_chunking must. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100") addon = {"chunker": default_chunker_config()} assert addon["chunker"]["fixed_token"]["chunk_overlap_token_size"] == 100 cfg = TextChunkingConfig.model_validate( {"strategy": "fixed_token", "params": {"chunk_token_size": 50}} ) with pytest.raises(ValueError, match="chunk_overlap_token_size"): _resolve_text_chunking(cfg, _stub_rag(addon)) def test_resolve_allows_size_above_inherited_overlap(monkeypatch): monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100") addon = {"chunker": default_chunker_config()} cfg = TextChunkingConfig.model_validate( {"strategy": "fixed_token", "params": {"chunk_token_size": 400}} ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon)) assert chunk_options["fixed_token"]["chunk_token_size"] == 400 def test_resolve_skips_overlap_check_for_delimiter_only(monkeypatch): # Delimiter-only fixed-token chunking never uses overlap, so a small # chunk_token_size below the inherited overlap must NOT be rejected. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100") addon = {"chunker": default_chunker_config()} cfg = TextChunkingConfig.model_validate( { "strategy": "fixed_token", "params": { "split_by_character": "\n\n", "split_by_character_only": True, "chunk_token_size": 50, }, } ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon)) assert chunk_options["fixed_token"]["chunk_token_size"] == 50 def test_resolve_enforces_overlap_when_only_flag_without_delimiter(monkeypatch): # split_by_character_only is a no-op without split_by_character: the chunker # falls back to normal token windowing, which DOES use overlap — so the # overlap < size check must still fire here. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100") monkeypatch.delenv("CHUNK_F_SPLIT_BY_CHARACTER", raising=False) addon = {"chunker": default_chunker_config()} cfg = TextChunkingConfig.model_validate( { "strategy": "fixed_token", "params": {"split_by_character_only": True, "chunk_token_size": 50}, } ) with pytest.raises(ValueError, match="chunk_overlap_token_size"): _resolve_text_chunking(cfg, _stub_rag(addon)) def test_resolve_allows_amount_over_100_with_inherited_std_type(): # Request overrides only the amount; the standard_deviation type is # inherited from addon_params. std/iqr have no (0, 100] ceiling, so this # must NOT be rejected (the request model deferred the check here). addon = { "chunker": { "semantic_vector": {"breakpoint_threshold_type": "standard_deviation"} } } cfg = TextChunkingConfig.model_validate( {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}} ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon)) assert chunk_options["semantic_vector"]["breakpoint_threshold_amount"] == 150 assert ( chunk_options["semantic_vector"]["breakpoint_threshold_type"] == "standard_deviation" ) def test_resolve_rejects_amount_over_100_with_inherited_percentile_type(): # Same partial override, but the effective (inherited) type is percentile, # which feeds np.percentile and requires the (0, 100] ceiling. addon = { "chunker": {"semantic_vector": {"breakpoint_threshold_type": "percentile"}} } cfg = TextChunkingConfig.model_validate( {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}} ) with pytest.raises(ValueError, match="breakpoint_threshold_amount"): _resolve_text_chunking(cfg, _stub_rag(addon)) def test_resolve_null_size_does_not_erase_inherited_default(monkeypatch): # An explicit null in the request must not overwrite the resolved size # with None (which would make the chunker do int(None) in the background). monkeypatch.setenv("CHUNK_F_SIZE", "640") addon = {"chunker": default_chunker_config()} cfg = TextChunkingConfig.model_validate( {"strategy": "fixed_token", "params": {"chunk_token_size": None}} ) _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon)) # null dropped by the model -> inherited CHUNK_F_SIZE survives, no None. assert chunk_options["fixed_token"]["chunk_token_size"] == 640 # --------------------------------------------------------------------------- # 3. Route forwarding + synchronous 422 # --------------------------------------------------------------------------- class _FwdDocStatus: async def get_doc_by_file_basename(self, basename): return None class _FwdRag: workspace = "chunk-fwd-test" addon_params: dict = {} def __init__(self): self.doc_status = _FwdDocStatus() _HEADERS = {"X-API-Key": "test-key"} def _make_client(monkeypatch, addon_params=None): """Build a TestClient whose enqueue-slot guards are no-ops and whose ``pipeline_index_texts`` is a spy recording the forwarded args. ``addon_params`` seeds the rag the routes resolve chunking against; the handler calls the real ``_resolve_text_chunking`` synchronously, so the effective-overlap validation runs against this snapshot. """ captured: dict = {} async def _spy(rag, texts, file_sources=None, track_id=None, chunking=None): captured["texts"] = texts captured["file_sources"] = file_sources captured["chunking"] = chunking async def _noop_reserve(rag): return False async def _noop_release(rag): return None monkeypatch.setattr(_dr, "pipeline_index_texts", _spy) monkeypatch.setattr(_dr, "_reserve_enqueue_slot", _noop_reserve) monkeypatch.setattr(_dr, "_release_enqueue_slot", _noop_release) rag = _FwdRag() rag.addon_params = addon_params if addon_params is not None else {} app = FastAPI() app.include_router( create_document_routes(rag, SimpleNamespace(), api_key="test-key") ) return TestClient(app), captured def test_insert_text_forwards_chunking(monkeypatch): client, captured = _make_client(monkeypatch) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello world", "file_source": "a.md", "chunking": { "strategy": "recursive_character", "params": {"chunk_token_size": 1000, "separators": ["X"]}, }, }, ) assert resp.status_code == 200 assert captured["chunking"] is not None assert captured["chunking"].strategy == "recursive_character" assert captured["chunking"].params == { "chunk_token_size": 1000, "separators": ["X"], } def test_insert_texts_forwards_chunking(monkeypatch): client, captured = _make_client(monkeypatch) resp = client.post( "/documents/texts", headers=_HEADERS, json={ "texts": ["one", "two"], "file_sources": ["a.md", "b.md"], "chunking": {"strategy": "semantic_vector", "params": {"buffer_size": 2}}, }, ) assert resp.status_code == 200 assert captured["chunking"].strategy == "semantic_vector" assert captured["chunking"].params == {"buffer_size": 2} def test_insert_text_without_chunking_forwards_none(monkeypatch): client, captured = _make_client(monkeypatch) resp = client.post( "/documents/text", headers=_HEADERS, json={"text": "hello", "file_source": "a.md"}, ) assert resp.status_code == 200 assert captured["chunking"] is None def test_insert_text_returns_422_on_malformed_chunking_without_scheduling(monkeypatch): client, captured = _make_client(monkeypatch) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": { "strategy": "recursive_character", "params": {"separators": "notalist"}, }, }, ) assert resp.status_code == 422 # Body validation fails before the endpoint body runs: no background # indexing is scheduled, so the spy never fires. assert captured == {} def test_insert_text_returns_422_when_size_below_inherited_overlap(monkeypatch): # chunk_token_size=50 in the request, overlap=100 inherited from the # rag's addon_params (not in the request). The model can't catch this; # the handler's synchronous _resolve_text_chunking must, BEFORE any # background work is scheduled. addon = { "chunker": { "chunk_token_size": 1200, "fixed_token": {"chunk_overlap_token_size": 100}, } } client, captured = _make_client(monkeypatch, addon_params=addon) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": {"strategy": "fixed_token", "params": {"chunk_token_size": 50}}, }, ) assert resp.status_code == 422 assert "chunk_overlap_token_size" in resp.json()["detail"] # Rejected synchronously: background indexing never scheduled. assert captured == {} def test_insert_text_allows_amount_override_inheriting_std_type(monkeypatch): # Reviewer scenario: deployment sets standard_deviation; a request # overrides only breakpoint_threshold_amount (> 100). This must be # accepted (not 422), since std has no (0, 100] ceiling. addon = { "chunker": { "semantic_vector": {"breakpoint_threshold_type": "standard_deviation"} } } client, captured = _make_client(monkeypatch, addon_params=addon) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": { "strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}, }, }, ) assert resp.status_code == 200 assert captured["chunking"].params == {"breakpoint_threshold_amount": 150.0} def test_insert_text_rejects_amount_over_100_inheriting_percentile_type(monkeypatch): addon = { "chunker": {"semantic_vector": {"breakpoint_threshold_type": "percentile"}} } client, captured = _make_client(monkeypatch, addon_params=addon) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": { "strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}, }, }, ) assert resp.status_code == 422 assert "breakpoint_threshold_amount" in resp.json()["detail"] assert captured == {} def test_insert_text_rejects_malformed_sentence_split_regex(monkeypatch): # Malformed regex must 422 at request parse time, before scheduling. client, captured = _make_client(monkeypatch) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": { "strategy": "semantic_vector", "params": {"sentence_split_regex": "("}, }, }, ) assert resp.status_code == 422 assert captured == {} def test_insert_text_drops_explicit_null_param(monkeypatch): # "chunk_token_size": null must be treated as "inherit" (dropped), so the # request succeeds and the forwarded params carry no None that would later # crash the chunker with int(None). client, captured = _make_client(monkeypatch) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": { "strategy": "fixed_token", "params": {"chunk_token_size": None, "chunk_overlap_token_size": 50}, }, }, ) assert resp.status_code == 200 assert captured["chunking"].params == {"chunk_overlap_token_size": 50} def test_insert_text_allows_small_size_for_delimiter_only(monkeypatch): # Paragraph splitting with a small chunk_token_size: overlap is inherited # (100) but unused in delimiter-only mode, so this must succeed, not 422. addon = {"chunker": {"fixed_token": {"chunk_overlap_token_size": 100}}} client, captured = _make_client(monkeypatch, addon_params=addon) resp = client.post( "/documents/text", headers=_HEADERS, json={ "text": "hello", "file_source": "a.md", "chunking": { "strategy": "fixed_token", "params": { "split_by_character": "\n\n", "split_by_character_only": True, "chunk_token_size": 50, }, }, }, ) assert resp.status_code == 200 assert captured["chunking"].params["chunk_token_size"] == 50