test_document_routes_chunking.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673
"""Tests for the `/documents/text(s)` ``chunking`` request object.

Three concerns:

1. **Synchronous validation**: malformed ``chunking`` is rejected at
   request-parse time (HTTP 422 / ``ValidationError``) — never deferred to
   the background indexing task, where the HTTP response is already sent.
   The per-strategy typed params models do full type + value checking, not
   just unknown-key detection.
2. **``_resolve_text_chunking``**: a validated ``chunking`` config is frozen
   into ``(process_options, chunk_options)``; ``chunk_token_size`` and the
   strategy params land in the selected strategy's sub-dict, overriding any
   env-derived value, while the other strategy sub-dicts are dropped (slim).
3. **Route forwarding**: ``/documents/text`` and ``/documents/texts`` forward
   ``request.chunking`` to ``pipeline_index_texts`` and return 422 (without
   scheduling any background work) for a malformed body.
"""
  16. import importlib
  17. import sys
  18. from types import SimpleNamespace
  19. import pytest
  20. from fastapi import FastAPI
  21. from fastapi.testclient import TestClient
  22. from pydantic import ValidationError
  23. _original_argv = sys.argv[:]
  24. sys.argv = [sys.argv[0]]
  25. _dr = importlib.import_module("lightrag.api.routers.document_routes")
  26. sys.argv = _original_argv
  27. TextChunkingConfig = _dr.TextChunkingConfig
  28. InsertTextRequest = _dr.InsertTextRequest
  29. _resolve_text_chunking = _dr._resolve_text_chunking
  30. create_document_routes = _dr.create_document_routes
  31. from lightrag.constants import ( # noqa: E402
  32. PROCESS_OPTION_CHUNK_FIXED,
  33. PROCESS_OPTION_CHUNK_PARAGRAH,
  34. PROCESS_OPTION_CHUNK_RECURSIVE,
  35. PROCESS_OPTION_CHUNK_VECTOR,
  36. )
  37. from lightrag.parser.routing import default_chunker_config # noqa: E402
  38. pytestmark = pytest.mark.offline
  39. _ALL_STRATEGY_KEYS = {
  40. "fixed_token",
  41. "recursive_character",
  42. "semantic_vector",
  43. "paragraph_semantic",
  44. }
# ---------------------------------------------------------------------------
# 1. Synchronous validation
# ---------------------------------------------------------------------------
  48. @pytest.mark.parametrize(
  49. "body",
  50. [
  51. # wrong types (strict rejects lax coercion)
  52. {"strategy": "fixed_token", "params": {"chunk_token_size": True}},
  53. {"strategy": "fixed_token", "params": {"chunk_token_size": "5"}},
  54. {"strategy": "fixed_token", "params": {"chunk_token_size": 1.5}},
  55. {"strategy": "fixed_token", "params": {"chunk_overlap_token_size": "bad"}},
  56. {"strategy": "fixed_token", "params": {"split_by_character": 123}},
  57. {"strategy": "fixed_token", "params": {"split_by_character_only": 1}},
  58. {"strategy": "recursive_character", "params": {"separators": "abc"}},
  59. {"strategy": "recursive_character", "params": {"separators": [1, 2]}},
  60. # value / range
  61. {"strategy": "fixed_token", "params": {"chunk_token_size": 0}},
  62. {"strategy": "recursive_character", "params": {"chunk_overlap_token_size": -1}},
  63. {"strategy": "semantic_vector", "params": {"buffer_size": 0}},
  64. {"strategy": "semantic_vector", "params": {"buffer_size": True}},
  65. {
  66. "strategy": "semantic_vector",
  67. "params": {"breakpoint_threshold_type": "p99"},
  68. },
  69. {
  70. "strategy": "semantic_vector",
  71. "params": {"breakpoint_threshold_amount": 0},
  72. },
  73. {
  74. # strict float rejects strings (no lax numeric-string coercion)
  75. "strategy": "semantic_vector",
  76. "params": {"breakpoint_threshold_amount": "95"},
  77. },
  78. {
  79. # strict float rejects bool (bool is an int subclass, undesirable here)
  80. "strategy": "semantic_vector",
  81. "params": {"breakpoint_threshold_amount": True},
  82. },
  83. {
  84. # > 100 with an explicit percentile/gradient type is rejected at
  85. # parse time (both fields present, no inheritance ambiguity).
  86. "strategy": "semantic_vector",
  87. "params": {
  88. "breakpoint_threshold_type": "percentile",
  89. "breakpoint_threshold_amount": 150,
  90. },
  91. },
  92. {
  93. # malformed regex must be compiled/rejected at parse time
  94. "strategy": "semantic_vector",
  95. "params": {"sentence_split_regex": "("},
  96. },
  97. # cross-field
  98. {
  99. "strategy": "fixed_token",
  100. "params": {"chunk_token_size": 100, "chunk_overlap_token_size": 200},
  101. },
  102. # unknown / wrong-for-strategy keys
  103. {"strategy": "fixed_token", "params": {"bogus": 1}},
  104. {"strategy": "fixed_token", "params": {"separators": ["x"]}},
  105. {"strategy": "recursive_character", "params": {"buffer_size": 1}},
  106. ],
  107. )
  108. def test_chunking_config_rejects_malformed(body):
  109. with pytest.raises(ValidationError):
  110. TextChunkingConfig.model_validate(body)
  111. def test_chunking_config_defaults_to_fixed_token():
  112. cfg = TextChunkingConfig.model_validate({"params": {"chunk_token_size": 500}})
  113. assert cfg.strategy == "fixed_token"
  114. assert cfg.params == {"chunk_token_size": 500}
  115. def test_chunking_config_normalizes_to_supplied_keys_only():
  116. # int amount is coerced to float; only the supplied key survives.
  117. cfg = TextChunkingConfig.model_validate(
  118. {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 95}}
  119. )
  120. assert cfg.params == {"breakpoint_threshold_amount": 95.0}
  121. def test_chunking_config_amount_in_range_for_std_deviation():
  122. # standard_deviation only requires > 0 (no [0, 100] ceiling).
  123. cfg = TextChunkingConfig.model_validate(
  124. {
  125. "strategy": "semantic_vector",
  126. "params": {
  127. "breakpoint_threshold_type": "standard_deviation",
  128. "breakpoint_threshold_amount": 3.5,
  129. },
  130. }
  131. )
  132. assert cfg.params["breakpoint_threshold_amount"] == 3.5
  133. def test_chunking_config_amount_over_100_without_type_is_deferred():
  134. # Type omitted -> the (0, 100] ceiling cannot be decided at parse time
  135. # (the effective type may be inherited), so the model must NOT assume
  136. # percentile and reject. _resolve_text_chunking applies the ceiling later.
  137. cfg = TextChunkingConfig.model_validate(
  138. {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}}
  139. )
  140. assert cfg.params == {"breakpoint_threshold_amount": 150.0}
  141. def test_chunking_config_accepts_int_amount_widened_to_float():
  142. # Strict float accepts an int (JSON 95) and widens it to 95.0 — the common
  143. # documented threshold magnitude. (str/bool are rejected; see the
  144. # rejection matrix above.) Exercised via both python and JSON validation
  145. # modes so the FastAPI request path (which parses JSON) stays covered.
  146. cfg = TextChunkingConfig.model_validate(
  147. {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 95}}
  148. )
  149. assert cfg.params == {"breakpoint_threshold_amount": 95.0}
  150. assert isinstance(cfg.params["breakpoint_threshold_amount"], float)
  151. cfg_json = TextChunkingConfig.model_validate_json(
  152. '{"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 95}}'
  153. )
  154. assert cfg_json.params == {"breakpoint_threshold_amount": 95.0}
  155. def test_chunking_config_accepts_valid_sentence_split_regex():
  156. cfg = TextChunkingConfig.model_validate(
  157. {
  158. "strategy": "semantic_vector",
  159. "params": {"sentence_split_regex": r"(?<=[.?!])\s+"},
  160. }
  161. )
  162. assert cfg.params == {"sentence_split_regex": r"(?<=[.?!])\s+"}
  163. def test_chunking_config_drops_explicit_null():
  164. # Explicit null means "inherit the default" (every param field is
  165. # Optional/None=inherit), so it must be dropped — not merged over the
  166. # resolved default, which would later make the chunker do int(None).
  167. cfg = TextChunkingConfig.model_validate(
  168. {"strategy": "fixed_token", "params": {"chunk_token_size": None}}
  169. )
  170. assert cfg.params == {}
  171. def test_chunking_config_keeps_real_value_drops_sibling_null():
  172. cfg = TextChunkingConfig.model_validate(
  173. {
  174. "strategy": "fixed_token",
  175. "params": {"chunk_token_size": 500, "split_by_character": None},
  176. }
  177. )
  178. assert cfg.params == {"chunk_token_size": 500}
  179. def test_insert_text_request_rejects_malformed_chunking():
  180. with pytest.raises(ValidationError):
  181. InsertTextRequest.model_validate(
  182. {
  183. "text": "hi",
  184. "file_source": "a.md",
  185. "chunking": {
  186. "strategy": "recursive_character",
  187. "params": {"separators": "notalist"},
  188. },
  189. }
  190. )
# ---------------------------------------------------------------------------
# 2. _resolve_text_chunking
# ---------------------------------------------------------------------------
  194. def _stub_rag(addon_params=None):
  195. return SimpleNamespace(
  196. addon_params=addon_params if addon_params is not None else {}
  197. )
  198. def test_resolve_none_keeps_default_fixed():
  199. process_options, chunk_options = _resolve_text_chunking(None, _stub_rag())
  200. assert process_options == PROCESS_OPTION_CHUNK_FIXED
  201. assert "fixed_token" in chunk_options
  202. @pytest.mark.parametrize(
  203. "strategy,expected_po,key",
  204. [
  205. ("fixed_token", PROCESS_OPTION_CHUNK_FIXED, "fixed_token"),
  206. ("recursive_character", PROCESS_OPTION_CHUNK_RECURSIVE, "recursive_character"),
  207. ("semantic_vector", PROCESS_OPTION_CHUNK_VECTOR, "semantic_vector"),
  208. ("paragraph_semantic", PROCESS_OPTION_CHUNK_PARAGRAH, "paragraph_semantic"),
  209. ],
  210. )
  211. def test_resolve_maps_strategy_and_writes_size_into_subdict(strategy, expected_po, key):
  212. cfg = TextChunkingConfig.model_validate(
  213. {"strategy": strategy, "params": {"chunk_token_size": 777}}
  214. )
  215. process_options, chunk_options = _resolve_text_chunking(cfg, _stub_rag())
  216. assert process_options == expected_po
  217. # chunk_token_size lands in the strategy sub-dict for ALL strategies
  218. # (F included, post-cleanup) — that's where process_single_document reads it.
  219. assert chunk_options[key]["chunk_token_size"] == 777
  220. # slim contract: other strategies' sub-dicts are dropped
  221. for other in _ALL_STRATEGY_KEYS - {key}:
  222. assert other not in chunk_options
  223. def test_resolve_merges_strategy_params():
  224. cfg = TextChunkingConfig.model_validate(
  225. {
  226. "strategy": "recursive_character",
  227. "params": {"separators": ["A", "B"], "chunk_overlap_token_size": 0},
  228. }
  229. )
  230. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag())
  231. assert chunk_options["recursive_character"]["separators"] == ["A", "B"]
  232. assert chunk_options["recursive_character"]["chunk_overlap_token_size"] == 0
  233. def test_resolve_size_overrides_env_for_recursive(monkeypatch):
  234. monkeypatch.setenv("CHUNK_R_SIZE", "999")
  235. addon = {"chunker": default_chunker_config()}
  236. # sanity: env baked into the R sub-dict
  237. assert addon["chunker"]["recursive_character"]["chunk_token_size"] == 999
  238. cfg = TextChunkingConfig.model_validate(
  239. {"strategy": "recursive_character", "params": {"chunk_token_size": 1234}}
  240. )
  241. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon))
  242. # API value wins over the env-derived sub-dict value.
  243. assert chunk_options["recursive_character"]["chunk_token_size"] == 1234
  244. def test_resolve_split_by_character_only_false_overrides_env(monkeypatch):
  245. # The API path can express an explicit False (a plain dict merge), unlike
  246. # the ainsert positional-arg path. Prove it overrides an env-True default.
  247. monkeypatch.setenv("CHUNK_F_SPLIT_BY_CHARACTER_ONLY", "true")
  248. addon = {"chunker": default_chunker_config()}
  249. assert addon["chunker"]["fixed_token"]["split_by_character_only"] is True
  250. cfg = TextChunkingConfig.model_validate(
  251. {"strategy": "fixed_token", "params": {"split_by_character_only": False}}
  252. )
  253. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon))
  254. assert chunk_options["fixed_token"]["split_by_character_only"] is False
  255. def test_resolve_rejects_size_below_inherited_overlap(monkeypatch):
  256. # Overlap is inherited from addon_params (not in the request), so the
  257. # request model can't catch it — _resolve_text_chunking must.
  258. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100")
  259. addon = {"chunker": default_chunker_config()}
  260. assert addon["chunker"]["fixed_token"]["chunk_overlap_token_size"] == 100
  261. cfg = TextChunkingConfig.model_validate(
  262. {"strategy": "fixed_token", "params": {"chunk_token_size": 50}}
  263. )
  264. with pytest.raises(ValueError, match="chunk_overlap_token_size"):
  265. _resolve_text_chunking(cfg, _stub_rag(addon))
  266. def test_resolve_allows_size_above_inherited_overlap(monkeypatch):
  267. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100")
  268. addon = {"chunker": default_chunker_config()}
  269. cfg = TextChunkingConfig.model_validate(
  270. {"strategy": "fixed_token", "params": {"chunk_token_size": 400}}
  271. )
  272. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon))
  273. assert chunk_options["fixed_token"]["chunk_token_size"] == 400
  274. def test_resolve_skips_overlap_check_for_delimiter_only(monkeypatch):
  275. # Delimiter-only fixed-token chunking never uses overlap, so a small
  276. # chunk_token_size below the inherited overlap must NOT be rejected.
  277. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100")
  278. addon = {"chunker": default_chunker_config()}
  279. cfg = TextChunkingConfig.model_validate(
  280. {
  281. "strategy": "fixed_token",
  282. "params": {
  283. "split_by_character": "\n\n",
  284. "split_by_character_only": True,
  285. "chunk_token_size": 50,
  286. },
  287. }
  288. )
  289. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon))
  290. assert chunk_options["fixed_token"]["chunk_token_size"] == 50
  291. def test_resolve_enforces_overlap_when_only_flag_without_delimiter(monkeypatch):
  292. # split_by_character_only is a no-op without split_by_character: the chunker
  293. # falls back to normal token windowing, which DOES use overlap — so the
  294. # overlap < size check must still fire here.
  295. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "100")
  296. monkeypatch.delenv("CHUNK_F_SPLIT_BY_CHARACTER", raising=False)
  297. addon = {"chunker": default_chunker_config()}
  298. cfg = TextChunkingConfig.model_validate(
  299. {
  300. "strategy": "fixed_token",
  301. "params": {"split_by_character_only": True, "chunk_token_size": 50},
  302. }
  303. )
  304. with pytest.raises(ValueError, match="chunk_overlap_token_size"):
  305. _resolve_text_chunking(cfg, _stub_rag(addon))
  306. def test_resolve_allows_amount_over_100_with_inherited_std_type():
  307. # Request overrides only the amount; the standard_deviation type is
  308. # inherited from addon_params. std/iqr have no (0, 100] ceiling, so this
  309. # must NOT be rejected (the request model deferred the check here).
  310. addon = {
  311. "chunker": {
  312. "semantic_vector": {"breakpoint_threshold_type": "standard_deviation"}
  313. }
  314. }
  315. cfg = TextChunkingConfig.model_validate(
  316. {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}}
  317. )
  318. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon))
  319. assert chunk_options["semantic_vector"]["breakpoint_threshold_amount"] == 150
  320. assert (
  321. chunk_options["semantic_vector"]["breakpoint_threshold_type"]
  322. == "standard_deviation"
  323. )
  324. def test_resolve_rejects_amount_over_100_with_inherited_percentile_type():
  325. # Same partial override, but the effective (inherited) type is percentile,
  326. # which feeds np.percentile and requires the (0, 100] ceiling.
  327. addon = {
  328. "chunker": {"semantic_vector": {"breakpoint_threshold_type": "percentile"}}
  329. }
  330. cfg = TextChunkingConfig.model_validate(
  331. {"strategy": "semantic_vector", "params": {"breakpoint_threshold_amount": 150}}
  332. )
  333. with pytest.raises(ValueError, match="breakpoint_threshold_amount"):
  334. _resolve_text_chunking(cfg, _stub_rag(addon))
  335. def test_resolve_null_size_does_not_erase_inherited_default(monkeypatch):
  336. # An explicit null in the request must not overwrite the resolved size
  337. # with None (which would make the chunker do int(None) in the background).
  338. monkeypatch.setenv("CHUNK_F_SIZE", "640")
  339. addon = {"chunker": default_chunker_config()}
  340. cfg = TextChunkingConfig.model_validate(
  341. {"strategy": "fixed_token", "params": {"chunk_token_size": None}}
  342. )
  343. _, chunk_options = _resolve_text_chunking(cfg, _stub_rag(addon))
  344. # null dropped by the model -> inherited CHUNK_F_SIZE survives, no None.
  345. assert chunk_options["fixed_token"]["chunk_token_size"] == 640
# ---------------------------------------------------------------------------
# 3. Route forwarding + synchronous 422
# ---------------------------------------------------------------------------
  349. class _FwdDocStatus:
  350. async def get_doc_by_file_basename(self, basename):
  351. return None
  352. class _FwdRag:
  353. workspace = "chunk-fwd-test"
  354. addon_params: dict = {}
  355. def __init__(self):
  356. self.doc_status = _FwdDocStatus()
  357. _HEADERS = {"X-API-Key": "test-key"}
  358. def _make_client(monkeypatch, addon_params=None):
  359. """Build a TestClient whose enqueue-slot guards are no-ops and whose
  360. ``pipeline_index_texts`` is a spy recording the forwarded args.
  361. ``addon_params`` seeds the rag the routes resolve chunking against; the
  362. handler calls the real ``_resolve_text_chunking`` synchronously, so the
  363. effective-overlap validation runs against this snapshot.
  364. """
  365. captured: dict = {}
  366. async def _spy(rag, texts, file_sources=None, track_id=None, chunking=None):
  367. captured["texts"] = texts
  368. captured["file_sources"] = file_sources
  369. captured["chunking"] = chunking
  370. async def _noop_reserve(rag):
  371. return False
  372. async def _noop_release(rag):
  373. return None
  374. monkeypatch.setattr(_dr, "pipeline_index_texts", _spy)
  375. monkeypatch.setattr(_dr, "_reserve_enqueue_slot", _noop_reserve)
  376. monkeypatch.setattr(_dr, "_release_enqueue_slot", _noop_release)
  377. rag = _FwdRag()
  378. rag.addon_params = addon_params if addon_params is not None else {}
  379. app = FastAPI()
  380. app.include_router(
  381. create_document_routes(rag, SimpleNamespace(), api_key="test-key")
  382. )
  383. return TestClient(app), captured
  384. def test_insert_text_forwards_chunking(monkeypatch):
  385. client, captured = _make_client(monkeypatch)
  386. resp = client.post(
  387. "/documents/text",
  388. headers=_HEADERS,
  389. json={
  390. "text": "hello world",
  391. "file_source": "a.md",
  392. "chunking": {
  393. "strategy": "recursive_character",
  394. "params": {"chunk_token_size": 1000, "separators": ["X"]},
  395. },
  396. },
  397. )
  398. assert resp.status_code == 200
  399. assert captured["chunking"] is not None
  400. assert captured["chunking"].strategy == "recursive_character"
  401. assert captured["chunking"].params == {
  402. "chunk_token_size": 1000,
  403. "separators": ["X"],
  404. }
  405. def test_insert_texts_forwards_chunking(monkeypatch):
  406. client, captured = _make_client(monkeypatch)
  407. resp = client.post(
  408. "/documents/texts",
  409. headers=_HEADERS,
  410. json={
  411. "texts": ["one", "two"],
  412. "file_sources": ["a.md", "b.md"],
  413. "chunking": {"strategy": "semantic_vector", "params": {"buffer_size": 2}},
  414. },
  415. )
  416. assert resp.status_code == 200
  417. assert captured["chunking"].strategy == "semantic_vector"
  418. assert captured["chunking"].params == {"buffer_size": 2}
  419. def test_insert_text_without_chunking_forwards_none(monkeypatch):
  420. client, captured = _make_client(monkeypatch)
  421. resp = client.post(
  422. "/documents/text",
  423. headers=_HEADERS,
  424. json={"text": "hello", "file_source": "a.md"},
  425. )
  426. assert resp.status_code == 200
  427. assert captured["chunking"] is None
  428. def test_insert_text_returns_422_on_malformed_chunking_without_scheduling(monkeypatch):
  429. client, captured = _make_client(monkeypatch)
  430. resp = client.post(
  431. "/documents/text",
  432. headers=_HEADERS,
  433. json={
  434. "text": "hello",
  435. "file_source": "a.md",
  436. "chunking": {
  437. "strategy": "recursive_character",
  438. "params": {"separators": "notalist"},
  439. },
  440. },
  441. )
  442. assert resp.status_code == 422
  443. # Body validation fails before the endpoint body runs: no background
  444. # indexing is scheduled, so the spy never fires.
  445. assert captured == {}
  446. def test_insert_text_returns_422_when_size_below_inherited_overlap(monkeypatch):
  447. # chunk_token_size=50 in the request, overlap=100 inherited from the
  448. # rag's addon_params (not in the request). The model can't catch this;
  449. # the handler's synchronous _resolve_text_chunking must, BEFORE any
  450. # background work is scheduled.
  451. addon = {
  452. "chunker": {
  453. "chunk_token_size": 1200,
  454. "fixed_token": {"chunk_overlap_token_size": 100},
  455. }
  456. }
  457. client, captured = _make_client(monkeypatch, addon_params=addon)
  458. resp = client.post(
  459. "/documents/text",
  460. headers=_HEADERS,
  461. json={
  462. "text": "hello",
  463. "file_source": "a.md",
  464. "chunking": {"strategy": "fixed_token", "params": {"chunk_token_size": 50}},
  465. },
  466. )
  467. assert resp.status_code == 422
  468. assert "chunk_overlap_token_size" in resp.json()["detail"]
  469. # Rejected synchronously: background indexing never scheduled.
  470. assert captured == {}
  471. def test_insert_text_allows_amount_override_inheriting_std_type(monkeypatch):
  472. # Reviewer scenario: deployment sets standard_deviation; a request
  473. # overrides only breakpoint_threshold_amount (> 100). This must be
  474. # accepted (not 422), since std has no (0, 100] ceiling.
  475. addon = {
  476. "chunker": {
  477. "semantic_vector": {"breakpoint_threshold_type": "standard_deviation"}
  478. }
  479. }
  480. client, captured = _make_client(monkeypatch, addon_params=addon)
  481. resp = client.post(
  482. "/documents/text",
  483. headers=_HEADERS,
  484. json={
  485. "text": "hello",
  486. "file_source": "a.md",
  487. "chunking": {
  488. "strategy": "semantic_vector",
  489. "params": {"breakpoint_threshold_amount": 150},
  490. },
  491. },
  492. )
  493. assert resp.status_code == 200
  494. assert captured["chunking"].params == {"breakpoint_threshold_amount": 150.0}
  495. def test_insert_text_rejects_amount_over_100_inheriting_percentile_type(monkeypatch):
  496. addon = {
  497. "chunker": {"semantic_vector": {"breakpoint_threshold_type": "percentile"}}
  498. }
  499. client, captured = _make_client(monkeypatch, addon_params=addon)
  500. resp = client.post(
  501. "/documents/text",
  502. headers=_HEADERS,
  503. json={
  504. "text": "hello",
  505. "file_source": "a.md",
  506. "chunking": {
  507. "strategy": "semantic_vector",
  508. "params": {"breakpoint_threshold_amount": 150},
  509. },
  510. },
  511. )
  512. assert resp.status_code == 422
  513. assert "breakpoint_threshold_amount" in resp.json()["detail"]
  514. assert captured == {}
  515. def test_insert_text_rejects_malformed_sentence_split_regex(monkeypatch):
  516. # Malformed regex must 422 at request parse time, before scheduling.
  517. client, captured = _make_client(monkeypatch)
  518. resp = client.post(
  519. "/documents/text",
  520. headers=_HEADERS,
  521. json={
  522. "text": "hello",
  523. "file_source": "a.md",
  524. "chunking": {
  525. "strategy": "semantic_vector",
  526. "params": {"sentence_split_regex": "("},
  527. },
  528. },
  529. )
  530. assert resp.status_code == 422
  531. assert captured == {}
  532. def test_insert_text_drops_explicit_null_param(monkeypatch):
  533. # "chunk_token_size": null must be treated as "inherit" (dropped), so the
  534. # request succeeds and the forwarded params carry no None that would later
  535. # crash the chunker with int(None).
  536. client, captured = _make_client(monkeypatch)
  537. resp = client.post(
  538. "/documents/text",
  539. headers=_HEADERS,
  540. json={
  541. "text": "hello",
  542. "file_source": "a.md",
  543. "chunking": {
  544. "strategy": "fixed_token",
  545. "params": {"chunk_token_size": None, "chunk_overlap_token_size": 50},
  546. },
  547. },
  548. )
  549. assert resp.status_code == 200
  550. assert captured["chunking"].params == {"chunk_overlap_token_size": 50}
  551. def test_insert_text_allows_small_size_for_delimiter_only(monkeypatch):
  552. # Paragraph splitting with a small chunk_token_size: overlap is inherited
  553. # (100) but unused in delimiter-only mode, so this must succeed, not 422.
  554. addon = {"chunker": {"fixed_token": {"chunk_overlap_token_size": 100}}}
  555. client, captured = _make_client(monkeypatch, addon_params=addon)
  556. resp = client.post(
  557. "/documents/text",
  558. headers=_HEADERS,
  559. json={
  560. "text": "hello",
  561. "file_source": "a.md",
  562. "chunking": {
  563. "strategy": "fixed_token",
  564. "params": {
  565. "split_by_character": "\n\n",
  566. "split_by_character_only": True,
  567. "chunk_token_size": 50,
  568. },
  569. },
  570. },
  571. )
  572. assert resp.status_code == 200
  573. assert captured["chunking"].params["chunk_token_size"] == 50