# test_chunk_options_persistence.py
"""Tests for the ``chunk_options`` snapshot mechanism.

Three properties under test:

1. **env-driven snapshot**: env vars (CHUNK_R_OVERLAP_SIZE,
   CHUNK_V_BREAKPOINT_THRESHOLD_TYPE, …) flow into
   ``addon_params['chunker']`` via
   :func:`lightrag.parser.routing.default_chunker_config`, then into
   ``full_docs[doc_id]['chunk_options']`` at enqueue time via
   :func:`lightrag.parser.routing.resolve_chunk_options`.
2. **caller-supplied chunk_options**: an explicit ``chunk_options``
   kwarg passed to ``apipeline_enqueue_documents`` is persisted
   verbatim and reaches the dispatched chunker as keyword args.
3. **per-file chunk_options as a list**: when chunk_options is a
   ``list[dict]`` aligned with ``input``, each doc gets its own
   independent persisted snapshot.
"""
  16. import asyncio
  17. from pathlib import Path
  18. import numpy as np
  19. import pytest
  20. from lightrag import LightRAG, ROLES, RoleLLMConfig
  21. from lightrag.constants import DEFAULT_R_SEPARATORS
  22. from lightrag.utils import EmbeddingFunc, Tokenizer, TokenizerInterface
  23. class _SimpleTokenizerImpl(TokenizerInterface):
  24. def encode(self, content: str):
  25. return [ord(ch) for ch in content]
  26. def decode(self, tokens):
  27. return "".join(chr(t) for t in tokens)
  28. async def _mock_embedding(texts: list[str]) -> np.ndarray:
  29. return np.random.rand(len(texts), 32)
  30. async def _mock_llm(prompt, **kwargs):
  31. return '{"name":"x","summary":"s","detail_description":"d"}'
# Pairs of (keyword-argument suffix, ``RoleLLMConfig`` field name).
# ``_new_rag`` joins each suffix onto a role name from ``ROLES`` to form the
# kwarg key it looks for, and passes a matching value to ``RoleLLMConfig``
# under the paired field name.
_ROLE_FIELD_SUFFIXES = (
    ("_llm_model_func", "func"),
    ("_llm_model_kwargs", "kwargs"),
    ("_llm_model_max_async", "max_async"),
    ("_llm_timeout", "timeout"),
)
  38. def _new_rag(tmp_path: Path, **kwargs) -> LightRAG:
  39. role_configs: dict[str, RoleLLMConfig] = {}
  40. for spec in ROLES:
  41. bucket = {}
  42. for suffix, target in _ROLE_FIELD_SUFFIXES:
  43. key = f"{spec.name}{suffix}"
  44. if key in kwargs:
  45. bucket[target] = kwargs.pop(key)
  46. if bucket:
  47. role_configs[spec.name] = RoleLLMConfig(**bucket)
  48. if role_configs:
  49. kwargs["role_llm_configs"] = role_configs
  50. return LightRAG(
  51. working_dir=str(tmp_path),
  52. workspace=f"chunkopts-{tmp_path.name}",
  53. llm_model_func=_mock_llm,
  54. embedding_func=EmbeddingFunc(
  55. embedding_dim=32,
  56. max_token_size=4096,
  57. func=_mock_embedding,
  58. ),
  59. tokenizer=Tokenizer("mock-tokenizer", _SimpleTokenizerImpl()),
  60. **kwargs,
  61. )
  62. @pytest.mark.offline
  63. def test_env_driven_snapshot_persisted_in_full_docs(tmp_path, monkeypatch):
  64. """Env vars + ainsert split args land in ``full_docs.chunk_options``.
  65. The persisted snapshot is slim — only the strategy slot selected by
  66. ``process_options`` survives — so each strategy is verified through
  67. its own enqueue with the matching selector.
  68. """
  69. monkeypatch.setenv("CHUNK_R_OVERLAP_SIZE", "42")
  70. monkeypatch.setenv("CHUNK_V_BREAKPOINT_THRESHOLD_TYPE", "interquartile")
  71. monkeypatch.setenv("CHUNK_V_BUFFER_SIZE", "3")
  72. async def _run():
  73. from lightrag.parser.routing import resolve_chunk_options
  74. rag = _new_rag(tmp_path)
  75. await rag.initialize_storages()
  76. try:
  77. # F slot — mirror what ``LightRAG.ainsert`` does: build the
  78. # F-scoped chunk_options snapshot from addon_params plus
  79. # F-strategy runtime args, then hand it to enqueue.
  80. chunk_opts_f = resolve_chunk_options(
  81. rag.addon_params,
  82. process_options="F",
  83. split_by_character="\n\n",
  84. split_by_character_only=True,
  85. )
  86. await rag.apipeline_enqueue_documents(
  87. "Body for F-strategy snapshot test.",
  88. ids=["doc-snap-f"],
  89. file_paths="snap-f.txt",
  90. track_id="track-snap-f",
  91. chunk_options=chunk_opts_f,
  92. )
  93. row_f = await rag.full_docs.get_by_id("doc-snap-f")
  94. # R slot — env-driven CHUNK_R_OVERLAP_SIZE flows through
  95. # addon_params['chunker'] into the persisted snapshot.
  96. await rag.apipeline_enqueue_documents(
  97. "Body for R-strategy snapshot test.",
  98. ids=["doc-snap-r"],
  99. file_paths="snap-r.[native-R].txt",
  100. track_id="track-snap-r",
  101. process_options="R",
  102. )
  103. row_r = await rag.full_docs.get_by_id("doc-snap-r")
  104. # V slot — env-driven CHUNK_V_* params likewise.
  105. await rag.apipeline_enqueue_documents(
  106. "Body for V-strategy snapshot test.",
  107. ids=["doc-snap-v"],
  108. file_paths="snap-v.[native-V].txt",
  109. track_id="track-snap-v",
  110. process_options="V",
  111. )
  112. row_v = await rag.full_docs.get_by_id("doc-snap-v")
  113. finally:
  114. await rag.finalize_storages()
  115. return row_f, row_r, row_v
  116. row_f, row_r, row_v = asyncio.run(_run())
  117. assert row_f is not None and row_r is not None and row_v is not None
  118. f_opts = row_f["chunk_options"]
  119. assert f_opts["fixed_token"]["split_by_character"] == "\n\n"
  120. assert f_opts["fixed_token"]["split_by_character_only"] is True
  121. # Slim contract: only the active strategy survives.
  122. assert "recursive_character" not in f_opts
  123. assert "semantic_vector" not in f_opts
  124. assert "paragraph_semantic" not in f_opts
  125. r_opts = row_r["chunk_options"]
  126. assert r_opts["recursive_character"]["chunk_overlap_token_size"] == 42
  127. assert "fixed_token" not in r_opts
  128. v_opts = row_v["chunk_options"]
  129. assert v_opts["semantic_vector"]["breakpoint_threshold_type"] == "interquartile"
  130. assert v_opts["semantic_vector"]["buffer_size"] == 3
  131. assert "fixed_token" not in v_opts
  132. @pytest.mark.offline
  133. def test_caller_supplied_chunk_options_reach_chunker(tmp_path, monkeypatch):
  134. """A caller-supplied ``chunk_options`` dict is persisted verbatim
  135. and the dispatcher splats it into the chunker call."""
  136. pytest.importorskip("langchain_text_splitters")
  137. import lightrag.chunker as chunker_pkg
  138. custom_options = {
  139. "chunk_token_size": 100,
  140. "fixed_token": {
  141. "chunk_overlap_token_size": 5,
  142. "split_by_character": None,
  143. "split_by_character_only": False,
  144. },
  145. "recursive_character": {
  146. "chunk_overlap_token_size": 0,
  147. "separators": ["|", ""],
  148. },
  149. "semantic_vector": {
  150. "breakpoint_threshold_type": "percentile",
  151. "breakpoint_threshold_amount": None,
  152. "buffer_size": 1,
  153. },
  154. "paragraph_semantic": {},
  155. }
  156. captured: dict = {}
  157. def _r_spy(tokenizer, content, chunk_token_size, **kwargs):
  158. captured["chunk_token_size"] = chunk_token_size
  159. captured["kwargs"] = dict(kwargs)
  160. return [
  161. {"tokens": 5, "content": "stub", "chunk_order_index": 0},
  162. ]
  163. monkeypatch.setattr(chunker_pkg, "chunking_by_recursive_character", _r_spy)
  164. async def _run():
  165. rag = _new_rag(tmp_path)
  166. await rag.initialize_storages()
  167. try:
  168. await rag.apipeline_enqueue_documents(
  169. "alpha|beta|gamma|delta",
  170. file_paths="caller.[native-R].txt",
  171. track_id="track-caller",
  172. process_options="R",
  173. chunk_options=custom_options,
  174. )
  175. await rag.apipeline_process_enqueue_documents()
  176. finally:
  177. await rag.finalize_storages()
  178. asyncio.run(_run())
  179. assert (
  180. captured.get("chunk_token_size") == 100
  181. ), f"R chunker must receive caller-supplied chunk_token_size; got {captured!r}"
  182. assert captured["kwargs"]["separators"] == ["|", ""]
  183. assert captured["kwargs"]["chunk_overlap_token_size"] == 0
  184. @pytest.mark.offline
  185. def test_per_file_chunk_options_list(tmp_path, monkeypatch):
  186. """A ``chunk_options`` list aligned with ``input`` writes
  187. independent snapshots per doc.
  188. The two docs use ``process_options="R"`` so the slim snapshot
  189. keeps their distinct R-strategy params; F/V/P sub-dicts in the
  190. caller-supplied input are dropped by design.
  191. """
  192. opts_a = {
  193. "chunk_token_size": 1200,
  194. "fixed_token": {
  195. "chunk_overlap_token_size": 100,
  196. "split_by_character": None,
  197. "split_by_character_only": False,
  198. },
  199. "recursive_character": {
  200. "chunk_overlap_token_size": 100,
  201. "separators": ["A_SEP"],
  202. },
  203. "semantic_vector": {
  204. "breakpoint_threshold_type": "percentile",
  205. "breakpoint_threshold_amount": None,
  206. "buffer_size": 1,
  207. },
  208. "paragraph_semantic": {},
  209. }
  210. opts_b = {
  211. "chunk_token_size": 1200,
  212. "fixed_token": {
  213. "chunk_overlap_token_size": 100,
  214. "split_by_character": None,
  215. "split_by_character_only": False,
  216. },
  217. "recursive_character": {
  218. "chunk_overlap_token_size": 100,
  219. "separators": ["B_SEP"],
  220. },
  221. "semantic_vector": {
  222. "breakpoint_threshold_type": "percentile",
  223. "breakpoint_threshold_amount": None,
  224. "buffer_size": 1,
  225. },
  226. "paragraph_semantic": {},
  227. }
  228. async def _run():
  229. rag = _new_rag(tmp_path)
  230. await rag.initialize_storages()
  231. try:
  232. await rag.apipeline_enqueue_documents(
  233. ["doc one body", "doc two body"],
  234. ids=["doc-aaaaa-list", "doc-bbbbb-list"],
  235. file_paths=["a.[native-R].txt", "b.[native-R].txt"],
  236. track_id="track-list",
  237. process_options=["R", "R"],
  238. chunk_options=[opts_a, opts_b],
  239. )
  240. row_a = await rag.full_docs.get_by_id("doc-aaaaa-list")
  241. row_b = await rag.full_docs.get_by_id("doc-bbbbb-list")
  242. finally:
  243. await rag.finalize_storages()
  244. return row_a, row_b
  245. row_a, row_b = asyncio.run(_run())
  246. assert row_a is not None and row_b is not None
  247. sep_a = row_a["chunk_options"]["recursive_character"]["separators"]
  248. sep_b = row_b["chunk_options"]["recursive_character"]["separators"]
  249. assert sep_a == ["A_SEP"]
  250. assert sep_b == ["B_SEP"]
  251. # Independence: mutating one snapshot must not bleed into the other.
  252. sep_a.append("MUT")
  253. assert "MUT" not in row_b["chunk_options"]["recursive_character"]["separators"]
  254. # Slim contract: non-R strategy slots are dropped from the persisted
  255. # snapshot since they would never be consumed at process time.
  256. assert "fixed_token" not in row_a["chunk_options"]
  257. assert "semantic_vector" not in row_a["chunk_options"]
  258. assert "paragraph_semantic" not in row_a["chunk_options"]
  259. @pytest.mark.offline
  260. def test_constructor_chunk_size_overlays_addon_params(tmp_path, monkeypatch):
  261. """``LightRAG(chunk_token_size=N, chunk_overlap_token_size=M)`` must
  262. actually take effect — the per-doc snapshot is built from
  263. ``addon_params['chunker']``, so the constructor values have to be
  264. overlaid onto it (otherwise env-driven defaults would silently win).
  265. """
  266. # Set env vars to non-default values so the env path would be
  267. # observably different from the constructor path.
  268. monkeypatch.setenv("CHUNK_SIZE", "1200")
  269. monkeypatch.setenv("CHUNK_OVERLAP_SIZE", "100")
  270. async def _run():
  271. rag = _new_rag(
  272. tmp_path,
  273. chunk_token_size=7,
  274. chunk_overlap_token_size=2,
  275. )
  276. await rag.initialize_storages()
  277. try:
  278. await rag.apipeline_enqueue_documents(
  279. "Body for constructor overlay test.",
  280. ids=["doc-ctor-overlay"],
  281. file_paths="ctor.txt",
  282. track_id="track-ctor",
  283. )
  284. row = await rag.full_docs.get_by_id("doc-ctor-overlay")
  285. finally:
  286. await rag.finalize_storages()
  287. return row, rag.addon_params
  288. row, addon_params = asyncio.run(_run())
  289. assert row is not None
  290. chunk_opts = row["chunk_options"]
  291. # Top-level chunk_token_size carries the constructor value.
  292. assert chunk_opts["chunk_token_size"] == 7
  293. # Default-F doc: the persisted slim snapshot only carries F's slot.
  294. assert chunk_opts["fixed_token"]["chunk_overlap_token_size"] == 2
  295. assert "recursive_character" not in chunk_opts
  296. assert "semantic_vector" not in chunk_opts
  297. assert "paragraph_semantic" not in chunk_opts
  298. # addon_params still reflects the constructor overlay across every
  299. # strategy so subsequent enqueues with other selectors pick up the
  300. # same baseline. V doesn't have chunk_overlap_token_size and must
  301. # remain unchanged.
  302. assert addon_params["chunker"]["chunk_token_size"] == 7
  303. assert addon_params["chunker"]["fixed_token"]["chunk_overlap_token_size"] == 2
  304. assert (
  305. addon_params["chunker"]["recursive_character"]["chunk_overlap_token_size"] == 2
  306. )
  307. assert (
  308. addon_params["chunker"]["paragraph_semantic"]["chunk_overlap_token_size"] == 2
  309. )
  310. assert "chunk_overlap_token_size" not in addon_params["chunker"]["semantic_vector"]
  311. @pytest.mark.offline
  312. def test_addon_params_chunker_wins_when_constructor_field_unset(tmp_path):
  313. """If the constructor field is left at its default (``None``), an
  314. explicit ``addon_params={'chunker': {...}}`` must NOT be clobbered.
  315. """
  316. async def _run():
  317. rag = _new_rag(
  318. tmp_path,
  319. addon_params={
  320. "chunker": {
  321. "chunk_token_size": 5000,
  322. "fixed_token": {
  323. "chunk_overlap_token_size": 250,
  324. "split_by_character": None,
  325. "split_by_character_only": False,
  326. },
  327. "recursive_character": {
  328. "chunk_overlap_token_size": 250,
  329. "separators": ["\n\n", "\n", " ", ""],
  330. },
  331. "semantic_vector": {
  332. "breakpoint_threshold_type": "percentile",
  333. "breakpoint_threshold_amount": None,
  334. "buffer_size": 1,
  335. },
  336. "paragraph_semantic": {},
  337. },
  338. },
  339. )
  340. await rag.initialize_storages()
  341. try:
  342. await rag.apipeline_enqueue_documents(
  343. "Body for addon-only overlay test.",
  344. ids=["doc-addon-only"],
  345. file_paths="addon.txt",
  346. track_id="track-addon",
  347. )
  348. row = await rag.full_docs.get_by_id("doc-addon-only")
  349. finally:
  350. await rag.finalize_storages()
  351. return row, rag.chunk_token_size, rag.chunk_overlap_token_size
  352. row, ctor_size, ctor_overlap = asyncio.run(_run())
  353. assert row is not None
  354. assert row["chunk_options"]["chunk_token_size"] == 5000
  355. assert row["chunk_options"]["fixed_token"]["chunk_overlap_token_size"] == 250
  356. # Legacy instance fields back-fill from addon_params (not env defaults).
  357. assert ctor_size == 5000
  358. assert ctor_overlap == 250
  359. @pytest.mark.offline
  360. def test_strategy_env_wins_over_legacy_ctor_field(tmp_path, monkeypatch):
  361. """Specificity-ordered precedence: strategy-specific env vars beat
  362. the strategy-agnostic legacy constructor field.
  363. Setup: ``CHUNK_R_OVERLAP_SIZE=42`` is strategy-specific for R.
  364. ``LightRAG(chunk_overlap_token_size=2)`` is the legacy
  365. strategy-agnostic field. R must end up at 42 (env wins on its own
  366. strategy slot), F at 2 (no F-specific env, so legacy field fills).
  367. """
  368. monkeypatch.setenv("CHUNK_R_OVERLAP_SIZE", "42")
  369. monkeypatch.delenv("CHUNK_F_OVERLAP_SIZE", raising=False)
  370. monkeypatch.delenv("CHUNK_OVERLAP_SIZE", raising=False)
  371. async def _run():
  372. rag = _new_rag(tmp_path, chunk_overlap_token_size=2)
  373. await rag.initialize_storages()
  374. try:
  375. # R-strategy doc — strategy-specific env wins.
  376. await rag.apipeline_enqueue_documents(
  377. "Body for R precedence test.",
  378. ids=["doc-prec-r"],
  379. file_paths="prec-r.[native-R].txt",
  380. track_id="track-prec-r",
  381. process_options="R",
  382. )
  383. row_r = await rag.full_docs.get_by_id("doc-prec-r")
  384. # F-strategy doc — no F-specific env, ctor field fills.
  385. await rag.apipeline_enqueue_documents(
  386. "Body for F precedence test.",
  387. ids=["doc-prec-f"],
  388. file_paths="prec-f.txt",
  389. track_id="track-prec-f",
  390. )
  391. row_f = await rag.full_docs.get_by_id("doc-prec-f")
  392. # P-strategy doc — no P-specific env, ctor field fills.
  393. await rag.apipeline_enqueue_documents(
  394. "Body for P precedence test.",
  395. ids=["doc-prec-p"],
  396. file_paths="prec-p.[native-P].txt",
  397. track_id="track-prec-p",
  398. process_options="P",
  399. )
  400. row_p = await rag.full_docs.get_by_id("doc-prec-p")
  401. finally:
  402. await rag.finalize_storages()
  403. return row_r, row_f, row_p, rag.chunk_overlap_token_size
  404. row_r, row_f, row_p, ctor_field = asyncio.run(_run())
  405. assert (
  406. row_r["chunk_options"]["recursive_character"]["chunk_overlap_token_size"] == 42
  407. ), (
  408. "Strategy-specific CHUNK_R_OVERLAP_SIZE must win over the "
  409. "legacy chunk_overlap_token_size constructor field."
  410. )
  411. assert row_f["chunk_options"]["fixed_token"]["chunk_overlap_token_size"] == 2, (
  412. "Without a CHUNK_F_OVERLAP_SIZE override, the F slot falls back "
  413. "to the legacy constructor field."
  414. )
  415. assert row_p["chunk_options"]["paragraph_semantic"]["chunk_overlap_token_size"] == 2
  416. # self.chunk_overlap_token_size mirrors the F-strategy resolved value.
  417. assert ctor_field == 2
  418. @pytest.mark.offline
  419. def test_legacy_env_is_final_fallback(tmp_path, monkeypatch):
  420. """When neither a strategy env nor the legacy ctor field is set,
  421. the legacy ``CHUNK_OVERLAP_SIZE`` env is the final fallback for
  422. every per-strategy overlap slot."""
  423. monkeypatch.delenv("CHUNK_F_OVERLAP_SIZE", raising=False)
  424. monkeypatch.delenv("CHUNK_R_OVERLAP_SIZE", raising=False)
  425. monkeypatch.setenv("CHUNK_OVERLAP_SIZE", "77")
  426. async def _run():
  427. rag = _new_rag(tmp_path) # no chunk_overlap_token_size kwarg
  428. await rag.initialize_storages()
  429. try:
  430. await rag.apipeline_enqueue_documents(
  431. ["F body", "R body", "P body"],
  432. ids=["doc-legacy-f", "doc-legacy-r", "doc-legacy-p"],
  433. file_paths=[
  434. "legacy-f.txt",
  435. "legacy-r.[native-R].txt",
  436. "legacy-p.[native-P].txt",
  437. ],
  438. track_id="track-legacy",
  439. process_options=["", "R", "P"],
  440. )
  441. row_f = await rag.full_docs.get_by_id("doc-legacy-f")
  442. row_r = await rag.full_docs.get_by_id("doc-legacy-r")
  443. row_p = await rag.full_docs.get_by_id("doc-legacy-p")
  444. finally:
  445. await rag.finalize_storages()
  446. return row_f, row_r, row_p, rag.chunk_overlap_token_size
  447. row_f, row_r, row_p, ctor_field = asyncio.run(_run())
  448. assert row_f["chunk_options"]["fixed_token"]["chunk_overlap_token_size"] == 77
  449. assert (
  450. row_r["chunk_options"]["recursive_character"]["chunk_overlap_token_size"] == 77
  451. )
  452. assert (
  453. row_p["chunk_options"]["paragraph_semantic"]["chunk_overlap_token_size"] == 77
  454. )
  455. assert ctor_field == 77
  456. # Mixed case: F-specific env set, legacy still acts as R's fallback.
  457. monkeypatch.setenv("CHUNK_F_OVERLAP_SIZE", "10")
  458. async def _run_mixed():
  459. rag = _new_rag(tmp_path)
  460. await rag.initialize_storages()
  461. try:
  462. await rag.apipeline_enqueue_documents(
  463. ["F mixed body", "R mixed body", "P mixed body"],
  464. ids=["doc-mixed-f", "doc-mixed-r", "doc-mixed-p"],
  465. file_paths=[
  466. "mixed-f.txt",
  467. "mixed-r.[native-R].txt",
  468. "mixed-p.[native-P].txt",
  469. ],
  470. track_id="track-mixed",
  471. process_options=["", "R", "P"],
  472. )
  473. row_f = await rag.full_docs.get_by_id("doc-mixed-f")
  474. row_r = await rag.full_docs.get_by_id("doc-mixed-r")
  475. row_p = await rag.full_docs.get_by_id("doc-mixed-p")
  476. finally:
  477. await rag.finalize_storages()
  478. return row_f, row_r, row_p
  479. row_f, row_r, row_p = asyncio.run(_run_mixed())
  480. assert row_f["chunk_options"]["fixed_token"]["chunk_overlap_token_size"] == 10
  481. assert (
  482. row_r["chunk_options"]["recursive_character"]["chunk_overlap_token_size"] == 77
  483. )
  484. assert (
  485. row_p["chunk_options"]["paragraph_semantic"]["chunk_overlap_token_size"] == 77
  486. )
  487. @pytest.mark.offline
  488. def test_p_strategy_uses_dedicated_chunk_size_env(tmp_path, monkeypatch):
  489. """``CHUNK_P_SIZE`` must give P its own ``chunk_token_size``,
  490. decoupled from the global ``CHUNK_SIZE`` shared by F/R/V."""
  491. monkeypatch.setenv("CHUNK_SIZE", "1200")
  492. monkeypatch.setenv("CHUNK_P_SIZE", "999")
  493. import lightrag.chunker as chunker_pkg
  494. captured: dict = {}
  495. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  496. captured["chunk_token_size"] = chunk_token_size
  497. captured["blocks_path"] = blocks_path
  498. captured["kwargs"] = dict(kwargs)
  499. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  500. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  501. async def _run():
  502. rag = _new_rag(tmp_path)
  503. await rag.initialize_storages()
  504. try:
  505. await rag.apipeline_enqueue_documents(
  506. "stand-in body for paragraph-semantic chunker",
  507. file_paths="ctor.[native-P].txt",
  508. track_id="track-p-size",
  509. process_options="P",
  510. )
  511. await rag.apipeline_process_enqueue_documents()
  512. finally:
  513. await rag.finalize_storages()
  514. asyncio.run(_run())
  515. assert captured.get("chunk_token_size") == 999, (
  516. "P chunker must receive CHUNK_P_SIZE-derived chunk_token_size, "
  517. f"not the global CHUNK_SIZE; got {captured!r}"
  518. )
  519. # And the dispatcher must not double-pass chunk_token_size as kwarg.
  520. assert "chunk_token_size" not in captured["kwargs"]
  521. @pytest.mark.offline
  522. def test_p_strategy_defaults_to_dedicated_size_when_env_unset(tmp_path, monkeypatch):
  523. """When ``CHUNK_P_SIZE`` is unset, P uses ``DEFAULT_CHUNK_P_SIZE``
  524. rather than inheriting the global ``CHUNK_SIZE`` or
  525. ``LightRAG(chunk_token_size=…)``. Paragraph-semantic merging needs
  526. more headroom than the global default to keep related paragraphs
  527. together; silently inheriting the smaller global ceiling defeats
  528. the strategy's purpose."""
  529. from lightrag.constants import DEFAULT_CHUNK_P_SIZE
  530. monkeypatch.delenv("CHUNK_P_SIZE", raising=False)
  531. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  532. import lightrag.chunker as chunker_pkg
  533. captured: dict = {}
  534. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  535. captured["chunk_token_size"] = chunk_token_size
  536. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  537. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  538. async def _run():
  539. # Pass an explicit ctor chunk_token_size that differs from the
  540. # P default — proves P is decoupled from the global chain.
  541. rag = _new_rag(tmp_path, chunk_token_size=333)
  542. await rag.initialize_storages()
  543. try:
  544. await rag.apipeline_enqueue_documents(
  545. "fallback body",
  546. file_paths="ctor.[native-P].txt",
  547. track_id="track-p-fallback",
  548. process_options="P",
  549. )
  550. await rag.apipeline_process_enqueue_documents()
  551. finally:
  552. await rag.finalize_storages()
  553. asyncio.run(_run())
  554. assert captured.get("chunk_token_size") == DEFAULT_CHUNK_P_SIZE
  555. @pytest.mark.offline
  556. def test_p_strategy_default_size_survives_partial_addon_params(tmp_path, monkeypatch):
  557. """When the caller hands in a partial ``addon_params['chunker']``
  558. that lacks ``paragraph_semantic.chunk_token_size``,
  559. ``normalize_addon_params`` does NOT re-run ``default_chunker_config``,
  560. so the slot would silently fall back to the top-level resolved
  561. chunk size in the pipeline. ``_apply_chunk_size_overlay`` backfills
  562. ``DEFAULT_CHUNK_P_SIZE`` as the last guard."""
  563. from lightrag.constants import DEFAULT_CHUNK_P_SIZE
  564. monkeypatch.delenv("CHUNK_P_SIZE", raising=False)
  565. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  566. import lightrag.chunker as chunker_pkg
  567. captured: dict = {}
  568. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  569. captured["chunk_token_size"] = chunk_token_size
  570. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  571. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  572. async def _run():
  573. rag = _new_rag(
  574. tmp_path,
  575. chunk_token_size=333,
  576. addon_params={"chunker": {"paragraph_semantic": {}}},
  577. )
  578. await rag.initialize_storages()
  579. try:
  580. await rag.apipeline_enqueue_documents(
  581. "partial addon body",
  582. file_paths="ctor.[native-P].txt",
  583. track_id="track-p-partial",
  584. process_options="P",
  585. )
  586. await rag.apipeline_process_enqueue_documents()
  587. finally:
  588. await rag.finalize_storages()
  589. asyncio.run(_run())
  590. assert captured.get("chunk_token_size") == DEFAULT_CHUNK_P_SIZE, (
  591. "P chunker must use DEFAULT_CHUNK_P_SIZE even when caller passes "
  592. "a partial addon_params chunker dict; got "
  593. f"{captured.get('chunk_token_size')!r}"
  594. )
  595. @pytest.mark.offline
  596. def test_p_strategy_partial_addon_params_still_picks_up_env(tmp_path, monkeypatch):
  597. """When the caller hands in a partial ``addon_params['chunker']``
  598. that lacks ``paragraph_semantic.chunk_token_size`` AND
  599. ``CHUNK_P_SIZE`` env IS set, the overlay must pick up the env
  600. value rather than skipping straight to ``DEFAULT_CHUNK_P_SIZE``.
  601. Precedence: explicit addon_params > CHUNK_P_SIZE env >
  602. DEFAULT_CHUNK_P_SIZE. Without env-aware backfill the partial-
  603. addon-params path silently ignores deployment .env settings."""
  604. monkeypatch.setenv("CHUNK_P_SIZE", "4096")
  605. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  606. import lightrag.chunker as chunker_pkg
  607. captured: dict = {}
  608. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  609. captured["chunk_token_size"] = chunk_token_size
  610. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  611. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  612. async def _run():
  613. rag = _new_rag(
  614. tmp_path,
  615. chunk_token_size=333,
  616. addon_params={"chunker": {"paragraph_semantic": {}}},
  617. )
  618. await rag.initialize_storages()
  619. try:
  620. await rag.apipeline_enqueue_documents(
  621. "partial addon body with env",
  622. file_paths="ctor.[native-P].txt",
  623. track_id="track-p-partial-env",
  624. process_options="P",
  625. )
  626. await rag.apipeline_process_enqueue_documents()
  627. finally:
  628. await rag.finalize_storages()
  629. asyncio.run(_run())
  630. assert captured.get("chunk_token_size") == 4096, (
  631. "Partial addon_params must not mask CHUNK_P_SIZE env; got "
  632. f"{captured.get('chunk_token_size')!r}"
  633. )
  634. @pytest.mark.offline
  635. def test_p_strategy_runtime_chunker_mutation_picks_up_env(tmp_path, monkeypatch):
  636. """Runtime mutation via ``rag.addon_params["chunker"] = {...}``
  637. triggers ``ObservableAddonParams.__setitem__`` which only marks
  638. addon_params dirty — it does NOT re-run
  639. ``_apply_chunk_size_overlay``. ``resolve_chunk_options`` is the
  640. last chokepoint and must backfill P's chunk_token_size from
  641. ``CHUNK_P_SIZE`` env (or ``DEFAULT_CHUNK_P_SIZE``) when the
  642. mutation left the slot empty.
  643. Without that backfill, P silently inherits the top-level
  644. ``chunk_token_size`` (here ``333``) — the exact failure mode the
  645. dedicated default exists to prevent."""
  646. monkeypatch.setenv("CHUNK_P_SIZE", "4096")
  647. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  648. import lightrag.chunker as chunker_pkg
  649. captured: dict = {}
  650. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  651. captured["chunk_token_size"] = chunk_token_size
  652. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  653. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  654. async def _run():
  655. rag = _new_rag(tmp_path, chunk_token_size=333)
  656. await rag.initialize_storages()
  657. try:
  658. # Subscript assignment — bypasses _apply_chunk_size_overlay.
  659. rag.addon_params["chunker"] = {"paragraph_semantic": {}}
  660. await rag.apipeline_enqueue_documents(
  661. "runtime mutation body",
  662. file_paths="ctor.[native-P].txt",
  663. track_id="track-p-runtime",
  664. process_options="P",
  665. )
  666. await rag.apipeline_process_enqueue_documents()
  667. finally:
  668. await rag.finalize_storages()
  669. asyncio.run(_run())
  670. assert captured.get("chunk_token_size") == 4096, (
  671. "Runtime chunker mutation must not let P inherit the top-level "
  672. f"chunk_token_size; got {captured.get('chunk_token_size')!r}"
  673. )
  674. @pytest.mark.offline
  675. def test_p_strategy_runtime_chunker_mutation_uses_default_when_env_unset(
  676. tmp_path, monkeypatch
  677. ):
  678. """Sibling of the env-aware case: with ``CHUNK_P_SIZE`` unset,
  679. runtime-mutation enqueue still gets ``DEFAULT_CHUNK_P_SIZE``
  680. rather than the top-level ``chunk_token_size``."""
  681. from lightrag.constants import DEFAULT_CHUNK_P_SIZE
  682. monkeypatch.delenv("CHUNK_P_SIZE", raising=False)
  683. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  684. import lightrag.chunker as chunker_pkg
  685. captured: dict = {}
  686. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  687. captured["chunk_token_size"] = chunk_token_size
  688. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  689. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  690. async def _run():
  691. rag = _new_rag(tmp_path, chunk_token_size=333)
  692. await rag.initialize_storages()
  693. try:
  694. rag.addon_params["chunker"] = {"paragraph_semantic": {}}
  695. await rag.apipeline_enqueue_documents(
  696. "runtime mutation default body",
  697. file_paths="ctor.[native-P].txt",
  698. track_id="track-p-runtime-default",
  699. process_options="P",
  700. )
  701. await rag.apipeline_process_enqueue_documents()
  702. finally:
  703. await rag.finalize_storages()
  704. asyncio.run(_run())
  705. assert captured.get("chunk_token_size") == DEFAULT_CHUNK_P_SIZE
  706. @pytest.mark.offline
  707. def test_p_strategy_caller_chunk_options_picks_up_env(tmp_path, monkeypatch):
  708. """``apipeline_enqueue_documents(..., chunk_options=...)`` skips
  709. ``resolve_chunk_options`` and goes through ``slim_chunk_options``
  710. directly. The P backfill must still kick in there so an
  711. explicit ``chunk_options`` that omits the P slot does not let P
  712. fall back to the top-level ``chunk_token_size``."""
  713. monkeypatch.setenv("CHUNK_P_SIZE", "4096")
  714. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  715. import lightrag.chunker as chunker_pkg
  716. captured: dict = {}
  717. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  718. captured["chunk_token_size"] = chunk_token_size
  719. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  720. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  721. async def _run():
  722. rag = _new_rag(tmp_path, chunk_token_size=333)
  723. await rag.initialize_storages()
  724. try:
  725. await rag.apipeline_enqueue_documents(
  726. "caller chunk_options body",
  727. file_paths="ctor.[native-P].txt",
  728. track_id="track-p-caller-chunkopts",
  729. process_options="P",
  730. # Explicit kwarg path — bypasses resolve_chunk_options.
  731. # Also includes a top-level chunk_token_size to verify
  732. # P does NOT inherit it.
  733. chunk_options={
  734. "chunk_token_size": 333,
  735. "paragraph_semantic": {},
  736. },
  737. )
  738. await rag.apipeline_process_enqueue_documents()
  739. finally:
  740. await rag.finalize_storages()
  741. asyncio.run(_run())
  742. assert captured.get("chunk_token_size") == 4096, (
  743. "P must not inherit caller-supplied top-level chunk_token_size; "
  744. f"got {captured.get('chunk_token_size')!r}"
  745. )
  746. @pytest.mark.offline
  747. def test_p_strategy_caller_chunk_options_uses_default_when_env_unset(
  748. tmp_path, monkeypatch
  749. ):
  750. """Sibling of the env-aware case: with ``CHUNK_P_SIZE`` unset and
  751. a caller-supplied ``chunk_options`` that omits the P slot, the
  752. P backfill resolves to ``DEFAULT_CHUNK_P_SIZE`` — not the
  753. caller's top-level ``chunk_token_size``."""
  754. from lightrag.constants import DEFAULT_CHUNK_P_SIZE
  755. monkeypatch.delenv("CHUNK_P_SIZE", raising=False)
  756. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  757. import lightrag.chunker as chunker_pkg
  758. captured: dict = {}
  759. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  760. captured["chunk_token_size"] = chunk_token_size
  761. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  762. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  763. async def _run():
  764. rag = _new_rag(tmp_path, chunk_token_size=333)
  765. await rag.initialize_storages()
  766. try:
  767. await rag.apipeline_enqueue_documents(
  768. "caller chunk_options default body",
  769. file_paths="ctor.[native-P].txt",
  770. track_id="track-p-caller-default",
  771. process_options="P",
  772. chunk_options={
  773. "chunk_token_size": 333,
  774. "paragraph_semantic": {},
  775. },
  776. )
  777. await rag.apipeline_process_enqueue_documents()
  778. finally:
  779. await rag.finalize_storages()
  780. asyncio.run(_run())
  781. assert captured.get("chunk_token_size") == DEFAULT_CHUNK_P_SIZE
  782. @pytest.mark.offline
  783. def test_p_strategy_caller_chunk_options_respects_explicit_p_size(
  784. tmp_path, monkeypatch
  785. ):
  786. """Caller-supplied ``chunk_options`` carrying an explicit
  787. ``paragraph_semantic.chunk_token_size`` must win over both env
  788. and ``DEFAULT_CHUNK_P_SIZE``."""
  789. monkeypatch.setenv("CHUNK_P_SIZE", "4096")
  790. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  791. import lightrag.chunker as chunker_pkg
  792. captured: dict = {}
  793. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  794. captured["chunk_token_size"] = chunk_token_size
  795. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  796. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  797. async def _run():
  798. rag = _new_rag(tmp_path)
  799. await rag.initialize_storages()
  800. try:
  801. await rag.apipeline_enqueue_documents(
  802. "caller chunk_options explicit P size body",
  803. file_paths="ctor.[native-P].txt",
  804. track_id="track-p-caller-explicit",
  805. process_options="P",
  806. chunk_options={
  807. "paragraph_semantic": {"chunk_token_size": 8192},
  808. },
  809. )
  810. await rag.apipeline_process_enqueue_documents()
  811. finally:
  812. await rag.finalize_storages()
  813. asyncio.run(_run())
  814. assert captured.get("chunk_token_size") == 8192
  815. @pytest.mark.offline
  816. def test_p_strategy_respects_explicit_addon_params_chunk_size(tmp_path, monkeypatch):
  817. """``setdefault`` must not clobber an explicit
  818. ``paragraph_semantic.chunk_token_size`` the caller did provide."""
  819. monkeypatch.delenv("CHUNK_P_SIZE", raising=False)
  820. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  821. import lightrag.chunker as chunker_pkg
  822. captured: dict = {}
  823. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  824. captured["chunk_token_size"] = chunk_token_size
  825. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  826. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  827. async def _run():
  828. rag = _new_rag(
  829. tmp_path,
  830. addon_params={
  831. "chunker": {"paragraph_semantic": {"chunk_token_size": 4096}}
  832. },
  833. )
  834. await rag.initialize_storages()
  835. try:
  836. await rag.apipeline_enqueue_documents(
  837. "explicit addon body",
  838. file_paths="ctor.[native-P].txt",
  839. track_id="track-p-explicit",
  840. process_options="P",
  841. )
  842. await rag.apipeline_process_enqueue_documents()
  843. finally:
  844. await rag.finalize_storages()
  845. asyncio.run(_run())
  846. assert captured.get("chunk_token_size") == 4096
  847. @pytest.mark.offline
  848. def test_p_strategy_uses_dedicated_overlap_env(tmp_path, monkeypatch):
  849. monkeypatch.setenv("CHUNK_OVERLAP_SIZE", "11")
  850. monkeypatch.setenv("CHUNK_P_OVERLAP_SIZE", "66")
  851. import lightrag.chunker as chunker_pkg
  852. captured: dict = {}
  853. def _p_spy(tokenizer, content, chunk_token_size, *, blocks_path=None, **kwargs):
  854. captured["kwargs"] = dict(kwargs)
  855. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  856. monkeypatch.setattr(chunker_pkg, "chunking_by_paragraph_semantic", _p_spy)
  857. async def _run():
  858. rag = _new_rag(tmp_path)
  859. await rag.initialize_storages()
  860. try:
  861. await rag.apipeline_enqueue_documents(
  862. "P overlap body",
  863. ids=["doc-p-overlap"],
  864. file_paths="ctor.[native-P].txt",
  865. track_id="track-p-overlap",
  866. process_options="P",
  867. )
  868. row = await rag.full_docs.get_by_id("doc-p-overlap")
  869. await rag.apipeline_process_enqueue_documents()
  870. finally:
  871. await rag.finalize_storages()
  872. return row
  873. row = asyncio.run(_run())
  874. assert row["chunk_options"]["paragraph_semantic"]["chunk_overlap_token_size"] == 66
  875. assert captured["kwargs"]["chunk_overlap_token_size"] == 66
  876. @pytest.mark.offline
  877. def test_addon_params_strategy_wins_over_strategy_env(tmp_path, monkeypatch):
  878. """Highest tier check: a value sitting in
  879. ``addon_params['chunker'][<strategy>]['chunk_overlap_token_size']``
  880. must beat even a strategy-specific env."""
  881. monkeypatch.setenv("CHUNK_R_OVERLAP_SIZE", "42")
  882. async def _run():
  883. rag = _new_rag(
  884. tmp_path,
  885. addon_params={
  886. "chunker": {
  887. "recursive_character": {
  888. "chunk_overlap_token_size": 999,
  889. "separators": ["\n\n", "\n", " ", ""],
  890. },
  891. },
  892. },
  893. )
  894. await rag.initialize_storages()
  895. try:
  896. await rag.apipeline_enqueue_documents(
  897. "Body for addon-vs-env precedence test.",
  898. ids=["doc-addon-vs-env"],
  899. file_paths="addon.[native-R].txt",
  900. track_id="track-addon",
  901. process_options="R",
  902. )
  903. row = await rag.full_docs.get_by_id("doc-addon-vs-env")
  904. finally:
  905. await rag.finalize_storages()
  906. return row
  907. row = asyncio.run(_run())
  908. chunk_opts = row["chunk_options"]
  909. assert (
  910. chunk_opts["recursive_character"]["chunk_overlap_token_size"] == 999
  911. ), "addon_params explicit value must beat strategy-specific env."
  912. @pytest.mark.offline
  913. def test_runtime_addon_params_mutation_affects_subsequent_enqueue(tmp_path):
  914. """Mutating ``rag.addon_params['chunker']`` after construction must
  915. take effect for documents enqueued *after* the mutation, while
  916. documents enqueued *before* keep their frozen snapshot.
  917. """
  918. async def _run():
  919. rag = _new_rag(tmp_path)
  920. await rag.initialize_storages()
  921. try:
  922. # Doc A enqueued under default config (R strategy so the
  923. # mutated separators land in the persisted slim snapshot).
  924. await rag.apipeline_enqueue_documents(
  925. "first doc body",
  926. ids=["doc-pre-mutation"],
  927. file_paths=["pre.[native-R].txt"],
  928. track_id="track-pre",
  929. process_options="R",
  930. )
  931. row_pre = await rag.full_docs.get_by_id("doc-pre-mutation")
  932. sep_pre = list(
  933. row_pre["chunk_options"]["recursive_character"]["separators"]
  934. )
  935. # Mutate the runtime defaults.
  936. rag.addon_params["chunker"]["recursive_character"]["separators"] = [
  937. "##",
  938. "\n",
  939. ]
  940. # Doc B enqueued under the mutated defaults.
  941. await rag.apipeline_enqueue_documents(
  942. "second doc body",
  943. ids=["doc-post-mutation"],
  944. file_paths=["post.[native-R].txt"],
  945. track_id="track-post",
  946. process_options="R",
  947. )
  948. row_post = await rag.full_docs.get_by_id("doc-post-mutation")
  949. finally:
  950. await rag.finalize_storages()
  951. return sep_pre, row_post
  952. sep_pre, row_post = asyncio.run(_run())
  953. # Pre-mutation doc keeps the env-driven default cascade.
  954. assert sep_pre == list(DEFAULT_R_SEPARATORS)
  955. # Post-mutation doc reflects the runtime change.
  956. assert row_post["chunk_options"]["recursive_character"]["separators"] == [
  957. "##",
  958. "\n",
  959. ]
  960. @pytest.mark.offline
  961. def test_r_strategy_uses_dedicated_chunk_size_env(tmp_path, monkeypatch):
  962. """``CHUNK_R_SIZE`` must give R its own ``chunk_token_size``,
  963. decoupled from the global ``CHUNK_SIZE`` shared by F/V."""
  964. monkeypatch.setenv("CHUNK_SIZE", "1200")
  965. monkeypatch.setenv("CHUNK_R_SIZE", "777")
  966. import lightrag.chunker as chunker_pkg
  967. captured: dict = {}
  968. def _r_spy(tokenizer, content, chunk_token_size, **kwargs):
  969. captured["chunk_token_size"] = chunk_token_size
  970. captured["kwargs"] = dict(kwargs)
  971. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  972. monkeypatch.setattr(chunker_pkg, "chunking_by_recursive_character", _r_spy)
  973. async def _run():
  974. rag = _new_rag(tmp_path)
  975. await rag.initialize_storages()
  976. try:
  977. await rag.apipeline_enqueue_documents(
  978. "stand-in body for recursive-character chunker",
  979. file_paths="ctor.[native-R].txt",
  980. track_id="track-r-size",
  981. process_options="R",
  982. )
  983. await rag.apipeline_process_enqueue_documents()
  984. finally:
  985. await rag.finalize_storages()
  986. asyncio.run(_run())
  987. assert captured.get("chunk_token_size") == 777, (
  988. "R chunker must receive CHUNK_R_SIZE-derived chunk_token_size, "
  989. f"not the global CHUNK_SIZE; got {captured!r}"
  990. )
  991. # Dispatcher must not double-pass chunk_token_size as kwarg.
  992. assert "chunk_token_size" not in captured["kwargs"]
  993. @pytest.mark.offline
  994. def test_r_strategy_falls_back_to_global_chunk_size(tmp_path, monkeypatch):
  995. """When ``CHUNK_R_SIZE`` is unset and no per-doc R override is
  996. supplied, R inherits the top-level ``chunk_token_size`` resolved
  997. from the standard chain (here: ``LightRAG(chunk_token_size=…)``)."""
  998. monkeypatch.delenv("CHUNK_R_SIZE", raising=False)
  999. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  1000. import lightrag.chunker as chunker_pkg
  1001. captured: dict = {}
  1002. def _r_spy(tokenizer, content, chunk_token_size, **kwargs):
  1003. captured["chunk_token_size"] = chunk_token_size
  1004. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1005. monkeypatch.setattr(chunker_pkg, "chunking_by_recursive_character", _r_spy)
  1006. async def _run():
  1007. rag = _new_rag(tmp_path, chunk_token_size=444)
  1008. await rag.initialize_storages()
  1009. try:
  1010. await rag.apipeline_enqueue_documents(
  1011. "fallback body",
  1012. file_paths="ctor.[native-R].txt",
  1013. track_id="track-r-fallback",
  1014. process_options="R",
  1015. )
  1016. await rag.apipeline_process_enqueue_documents()
  1017. finally:
  1018. await rag.finalize_storages()
  1019. asyncio.run(_run())
  1020. assert captured.get("chunk_token_size") == 444
  1021. @pytest.mark.offline
  1022. def test_v_strategy_uses_dedicated_chunk_size_env(tmp_path, monkeypatch):
  1023. """``CHUNK_V_SIZE`` must give V its own ``chunk_token_size`` advisory
  1024. ceiling, decoupled from the global ``CHUNK_SIZE`` shared by F/R."""
  1025. monkeypatch.setenv("CHUNK_SIZE", "1200")
  1026. monkeypatch.setenv("CHUNK_V_SIZE", "2500")
  1027. import lightrag.chunker as chunker_pkg
  1028. captured: dict = {}
  1029. async def _v_spy(
  1030. tokenizer, content, chunk_token_size, *, embedding_func=None, **kwargs
  1031. ):
  1032. captured["chunk_token_size"] = chunk_token_size
  1033. captured["kwargs"] = dict(kwargs)
  1034. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1035. monkeypatch.setattr(chunker_pkg, "chunking_by_semantic_vector", _v_spy)
  1036. async def _run():
  1037. rag = _new_rag(tmp_path)
  1038. await rag.initialize_storages()
  1039. try:
  1040. await rag.apipeline_enqueue_documents(
  1041. "stand-in body for semantic-vector chunker",
  1042. file_paths="ctor.[native-V].txt",
  1043. track_id="track-v-size",
  1044. process_options="V",
  1045. )
  1046. await rag.apipeline_process_enqueue_documents()
  1047. finally:
  1048. await rag.finalize_storages()
  1049. asyncio.run(_run())
  1050. assert captured.get("chunk_token_size") == 2500, (
  1051. "V chunker must receive CHUNK_V_SIZE-derived chunk_token_size, "
  1052. f"not the global CHUNK_SIZE; got {captured!r}"
  1053. )
  1054. # Dispatcher must not double-pass chunk_token_size as kwarg.
  1055. assert "chunk_token_size" not in captured["kwargs"]
  1056. @pytest.mark.offline
  1057. def test_v_strategy_falls_back_to_global_chunk_size(tmp_path, monkeypatch):
  1058. """When ``CHUNK_V_SIZE`` is unset and no per-doc V override is
  1059. supplied, V inherits the top-level ``chunk_token_size`` resolved
  1060. from the standard chain (here: ``LightRAG(chunk_token_size=…)``)."""
  1061. monkeypatch.delenv("CHUNK_V_SIZE", raising=False)
  1062. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  1063. import lightrag.chunker as chunker_pkg
  1064. captured: dict = {}
  1065. async def _v_spy(
  1066. tokenizer, content, chunk_token_size, *, embedding_func=None, **kwargs
  1067. ):
  1068. captured["chunk_token_size"] = chunk_token_size
  1069. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1070. monkeypatch.setattr(chunker_pkg, "chunking_by_semantic_vector", _v_spy)
  1071. async def _run():
  1072. rag = _new_rag(tmp_path, chunk_token_size=555)
  1073. await rag.initialize_storages()
  1074. try:
  1075. await rag.apipeline_enqueue_documents(
  1076. "fallback body",
  1077. file_paths="ctor.[native-V].txt",
  1078. track_id="track-v-fallback",
  1079. process_options="V",
  1080. )
  1081. await rag.apipeline_process_enqueue_documents()
  1082. finally:
  1083. await rag.finalize_storages()
  1084. asyncio.run(_run())
  1085. assert captured.get("chunk_token_size") == 555
  1086. @pytest.mark.offline
  1087. def test_f_strategy_honors_subdict_chunk_size(tmp_path, monkeypatch):
  1088. """After the F cleanup, F honors a per-doc
  1089. ``fixed_token.chunk_token_size`` override (caller-supplied
  1090. chunk_options) instead of being locked to the top-level/global size —
  1091. matching R/V/P. Pre-cleanup this slot could not exist: ``**f_opts``
  1092. would collide with the positional ``chunk_token_size`` and TypeError.
  1093. """
  1094. monkeypatch.setenv("CHUNK_SIZE", "1200")
  1095. import lightrag.chunker as chunker_pkg
  1096. captured: dict = {}
  1097. def _f_spy(tokenizer, content, chunk_token_size, **kwargs):
  1098. captured["chunk_token_size"] = chunk_token_size
  1099. captured["kwargs"] = dict(kwargs)
  1100. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1101. monkeypatch.setattr(chunker_pkg, "chunking_by_fixed_token", _f_spy)
  1102. custom_options = {
  1103. # top-level global fallback — must be overridden by the sub-dict
  1104. "chunk_token_size": 1200,
  1105. "fixed_token": {
  1106. "chunk_token_size": 333,
  1107. "chunk_overlap_token_size": 7,
  1108. "split_by_character": None,
  1109. "split_by_character_only": False,
  1110. },
  1111. }
  1112. async def _run():
  1113. rag = _new_rag(tmp_path)
  1114. await rag.initialize_storages()
  1115. try:
  1116. await rag.apipeline_enqueue_documents(
  1117. "stand-in body for fixed-token chunker",
  1118. file_paths="ctor-f.txt",
  1119. track_id="track-f-size",
  1120. process_options="F",
  1121. chunk_options=custom_options,
  1122. )
  1123. await rag.apipeline_process_enqueue_documents()
  1124. finally:
  1125. await rag.finalize_storages()
  1126. asyncio.run(_run())
  1127. assert captured.get("chunk_token_size") == 333, (
  1128. "F chunker must receive the fixed_token.chunk_token_size override, "
  1129. f"not the top-level/global size; got {captured!r}"
  1130. )
  1131. # Dispatcher must pop it so it isn't also splatted as a kwarg (TypeError).
  1132. assert "chunk_token_size" not in captured["kwargs"]
  1133. assert captured["kwargs"]["chunk_overlap_token_size"] == 7
  1134. @pytest.mark.offline
  1135. def test_f_strategy_falls_back_to_top_level_size(tmp_path, monkeypatch):
  1136. """When the F sub-dict carries no ``chunk_token_size``, F still inherits
  1137. the top-level resolved size (here from ``LightRAG(chunk_token_size=…)``) —
  1138. the cleanup must not regress the existing global-size fallback."""
  1139. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  1140. import lightrag.chunker as chunker_pkg
  1141. captured: dict = {}
  1142. def _f_spy(tokenizer, content, chunk_token_size, **kwargs):
  1143. captured["chunk_token_size"] = chunk_token_size
  1144. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1145. monkeypatch.setattr(chunker_pkg, "chunking_by_fixed_token", _f_spy)
  1146. async def _run():
  1147. rag = _new_rag(tmp_path, chunk_token_size=456)
  1148. await rag.initialize_storages()
  1149. try:
  1150. await rag.apipeline_enqueue_documents(
  1151. "fallback body",
  1152. file_paths="ctor-f.txt",
  1153. track_id="track-f-fallback",
  1154. process_options="F",
  1155. )
  1156. await rag.apipeline_process_enqueue_documents()
  1157. finally:
  1158. await rag.finalize_storages()
  1159. asyncio.run(_run())
  1160. assert captured.get("chunk_token_size") == 456
  1161. @pytest.mark.offline
  1162. def test_f_strategy_uses_dedicated_chunk_size_env(tmp_path, monkeypatch):
  1163. """``CHUNK_F_SIZE`` gives F its own ``chunk_token_size``, decoupled from
  1164. the global ``CHUNK_SIZE`` shared as the fallback — symmetric with
  1165. ``CHUNK_R_SIZE`` / ``CHUNK_V_SIZE``."""
  1166. monkeypatch.setenv("CHUNK_SIZE", "1200")
  1167. monkeypatch.setenv("CHUNK_F_SIZE", "777")
  1168. import lightrag.chunker as chunker_pkg
  1169. captured: dict = {}
  1170. def _f_spy(tokenizer, content, chunk_token_size, **kwargs):
  1171. captured["chunk_token_size"] = chunk_token_size
  1172. captured["kwargs"] = dict(kwargs)
  1173. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1174. monkeypatch.setattr(chunker_pkg, "chunking_by_fixed_token", _f_spy)
  1175. async def _run():
  1176. rag = _new_rag(tmp_path)
  1177. await rag.initialize_storages()
  1178. try:
  1179. await rag.apipeline_enqueue_documents(
  1180. "stand-in body for fixed-token chunker",
  1181. file_paths="ctor-f.txt",
  1182. track_id="track-f-size",
  1183. process_options="F",
  1184. )
  1185. await rag.apipeline_process_enqueue_documents()
  1186. finally:
  1187. await rag.finalize_storages()
  1188. asyncio.run(_run())
  1189. assert captured.get("chunk_token_size") == 777, (
  1190. "F chunker must receive CHUNK_F_SIZE-derived chunk_token_size, "
  1191. f"not the global CHUNK_SIZE; got {captured!r}"
  1192. )
  1193. # Dispatcher must not double-pass chunk_token_size as kwarg.
  1194. assert "chunk_token_size" not in captured["kwargs"]
  1195. @pytest.mark.offline
  1196. def test_f_strategy_env_size_wins_over_legacy_ctor_field(tmp_path, monkeypatch):
  1197. """Specificity-ordered precedence: ``CHUNK_F_SIZE`` (strategy env, tier 2)
  1198. beats the strategy-agnostic legacy constructor field (tier 3)."""
  1199. monkeypatch.setenv("CHUNK_F_SIZE", "640")
  1200. monkeypatch.delenv("CHUNK_SIZE", raising=False)
  1201. import lightrag.chunker as chunker_pkg
  1202. captured: dict = {}
  1203. def _f_spy(tokenizer, content, chunk_token_size, **kwargs):
  1204. captured["chunk_token_size"] = chunk_token_size
  1205. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1206. monkeypatch.setattr(chunker_pkg, "chunking_by_fixed_token", _f_spy)
  1207. async def _run():
  1208. rag = _new_rag(tmp_path, chunk_token_size=999)
  1209. await rag.initialize_storages()
  1210. try:
  1211. await rag.apipeline_enqueue_documents(
  1212. "precedence body",
  1213. file_paths="ctor-f.txt",
  1214. track_id="track-f-prec",
  1215. process_options="F",
  1216. )
  1217. await rag.apipeline_process_enqueue_documents()
  1218. finally:
  1219. await rag.finalize_storages()
  1220. asyncio.run(_run())
  1221. assert captured.get("chunk_token_size") == 640
  1222. @pytest.mark.offline
  1223. def test_ainsert_legacy_path_honors_f_size_env(tmp_path, monkeypatch):
  1224. """``rag.ainsert()`` intentionally does NOT pass a ``process_options``
  1225. selector, so it runs the legacy ``chunking_func`` branch (preserving any
  1226. user-supplied chunking_func). That branch must still honor ``CHUNK_F_SIZE``
  1227. (i.e. ``fixed_token.chunk_token_size``) instead of only the global
  1228. ``CHUNK_SIZE`` — otherwise the SDK path would silently ignore it.
  1229. """
  1230. monkeypatch.setenv("CHUNK_SIZE", "1200")
  1231. monkeypatch.setenv("CHUNK_F_SIZE", "640")
  1232. captured: dict = {}
  1233. def _chunking_func_spy(
  1234. tokenizer,
  1235. content,
  1236. split_by_character,
  1237. split_by_character_only,
  1238. overlap,
  1239. chunk_token_size,
  1240. ):
  1241. captured["chunk_token_size"] = chunk_token_size
  1242. return [{"tokens": 5, "content": "stub", "chunk_order_index": 0}]
  1243. async def _run():
  1244. rag = _new_rag(tmp_path)
  1245. # Override the legacy 6-arg chunking_func to observe the size it gets.
  1246. rag.chunking_func = _chunking_func_spy
  1247. await rag.initialize_storages()
  1248. try:
  1249. await rag.ainsert("legacy path body", file_paths="legacy-f.txt")
  1250. finally:
  1251. await rag.finalize_storages()
  1252. asyncio.run(_run())
  1253. assert captured.get("chunk_token_size") == 640, (
  1254. "ainsert legacy chunking_func must receive CHUNK_F_SIZE-derived size, "
  1255. f"not the global CHUNK_SIZE; got {captured!r}"
  1256. )
  1257. @pytest.mark.offline
  1258. def test_partial_chunker_config_still_picks_up_size_env(tmp_path, monkeypatch):
  1259. """A partial ``addon_params['chunker']`` skips ``default_chunker_config``
  1260. (``normalize_addon_params`` only defaults the whole ``chunker`` key when
  1261. absent), so ``_apply_chunk_size_overlay`` must mirror the strategy
  1262. size-env seeding — otherwise ``CHUNK_F_SIZE`` / ``CHUNK_R_SIZE`` /
  1263. ``CHUNK_V_SIZE`` are silently ignored for partial configs.
  1264. """
  1265. monkeypatch.setenv("CHUNK_F_SIZE", "640")
  1266. monkeypatch.setenv("CHUNK_R_SIZE", "777")
  1267. monkeypatch.setenv("CHUNK_V_SIZE", "888")
  1268. # Partial config: only F's split_by_character is supplied; every
  1269. # chunk_token_size slot is absent and must be backfilled from env.
  1270. rag = _new_rag(
  1271. tmp_path,
  1272. addon_params={"chunker": {"fixed_token": {"split_by_character": "\n"}}},
  1273. )
  1274. chunker = rag.addon_params["chunker"]
  1275. assert chunker["fixed_token"]["chunk_token_size"] == 640
  1276. # Explicit caller value preserved alongside the env-backfilled size.
  1277. assert chunker["fixed_token"]["split_by_character"] == "\n"
  1278. assert chunker["recursive_character"]["chunk_token_size"] == 777
  1279. assert chunker["semantic_vector"]["chunk_token_size"] == 888
  1280. @pytest.mark.offline
  1281. def test_partial_chunker_config_explicit_size_beats_env(tmp_path, monkeypatch):
  1282. """An explicit ``fixed_token.chunk_token_size`` in a partial config wins
  1283. over ``CHUNK_F_SIZE`` (tier 1 > tier 2)."""
  1284. monkeypatch.setenv("CHUNK_F_SIZE", "640")
  1285. rag = _new_rag(
  1286. tmp_path,
  1287. addon_params={"chunker": {"fixed_token": {"chunk_token_size": 320}}},
  1288. )
  1289. assert rag.addon_params["chunker"]["fixed_token"]["chunk_token_size"] == 320
  1290. @pytest.mark.offline
  1291. def test_partial_chunker_config_no_size_env_leaves_slot_absent(tmp_path, monkeypatch):
  1292. """Without a size env, the slot stays absent so the strategy inherits the
  1293. top-level chunk_token_size at consumption time (no behavior change)."""
  1294. monkeypatch.delenv("CHUNK_F_SIZE", raising=False)
  1295. monkeypatch.delenv("CHUNK_R_SIZE", raising=False)
  1296. rag = _new_rag(
  1297. tmp_path,
  1298. addon_params={"chunker": {"recursive_character": {"separators": ["X"]}}},
  1299. )
  1300. chunker = rag.addon_params["chunker"]
  1301. assert "chunk_token_size" not in chunker["recursive_character"]
  1302. assert "chunk_token_size" not in chunker["fixed_token"]