test_multimodal_surrounding_context.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893
  1. """Unit tests for the native multimodal surrounding-context extractor.
  2. See ``docs/NativeMultimodalSurroundingContextPlan-zh.md``.
  3. These tests use a 1:1 character-token mapping so the expected token
  4. budgets in each scenario stay obvious without coupling to tiktoken's
  5. BPE. The helper functions exercised here are pure (no async, no
  6. network), so the suite runs offline.
  7. """
  8. import json
  9. import pytest
  10. from lightrag.multimodal_context import (
  11. build_surrounding,
  12. enrich_sidecars_with_surrounding,
  13. find_target_span,
  14. load_chunk_separators,
  15. )
  16. from lightrag.utils import Tokenizer, TokenizerInterface
  17. class _CharTokenizer(TokenizerInterface):
  18. def encode(self, content: str):
  19. return [ord(ch) for ch in content]
  20. def decode(self, tokens):
  21. return "".join(chr(t) for t in tokens)
  22. def _tokenizer() -> Tokenizer:
  23. return Tokenizer(model_name="char", tokenizer=_CharTokenizer())
  24. # ---------------------------------------------------------------------------
  25. # Target-tag locator
  26. # ---------------------------------------------------------------------------
  27. @pytest.mark.offline
  28. def test_find_target_span_drawing_in_mixed_content():
  29. content = (
  30. "leading text. "
  31. '<drawing id="im-abcd-0001" format="png" path="img.png" src="img" /> '
  32. "trailing text."
  33. )
  34. span = find_target_span("drawings", "im-abcd-0001", content)
  35. assert span is not None
  36. start, end = span
  37. assert content[start:end].startswith('<drawing id="im-abcd-0001"')
  38. assert content[start:end].endswith("/>")
  39. @pytest.mark.offline
  40. def test_find_target_span_table_with_id_anywhere_in_attrs():
  41. # id is not first attribute — locator must still find it.
  42. content = (
  43. 'before <table format="json" id="tb-abcd-0007">[[1,2],[3,4]]</table> after'
  44. )
  45. span = find_target_span("tables", "tb-abcd-0007", content)
  46. assert span is not None
  47. snippet = content[span[0] : span[1]]
  48. assert snippet.endswith("</table>")
  49. assert 'id="tb-abcd-0007"' in snippet
  50. @pytest.mark.offline
  51. def test_find_target_span_table_cite_marker():
  52. content = 'before <cite type="table" refid="tb-abcd-0007">表1</cite> after'
  53. span = find_target_span("tables", "tb-abcd-0007", content)
  54. assert span is not None
  55. assert content[span[0] : span[1]].startswith("<cite")
  56. @pytest.mark.offline
  57. def test_find_target_span_equation():
  58. content = 'A <equation id="eq-abcd-0002" format="latex">x^2</equation> B'
  59. span = find_target_span("equations", "eq-abcd-0002", content)
  60. assert span is not None
  61. assert content[span[0] : span[1]].endswith("</equation>")
  62. @pytest.mark.offline
  63. def test_find_target_span_unknown_id_returns_none():
  64. content = '<drawing id="im-1" />'
  65. assert find_target_span("drawings", "im-other", content) is None
  66. # ---------------------------------------------------------------------------
  67. # Drawings & equations surrounding
  68. # ---------------------------------------------------------------------------
  69. @pytest.mark.offline
  70. def test_drawing_surrounding_kept_within_block_only():
  71. tok = _tokenizer()
  72. block = (
  73. "paragraph one ends. paragraph two. "
  74. '<drawing id="im-1" path="a.png" src="a" /> '
  75. "paragraph three. paragraph four."
  76. )
  77. span = find_target_span("drawings", "im-1", block)
  78. surr = build_surrounding(
  79. kind="drawings",
  80. block_content=block,
  81. span=span,
  82. tokenizer=tok,
  83. leading_max_tokens=2000,
  84. trailing_max_tokens=2000,
  85. separators=load_chunk_separators(),
  86. )
  87. assert surr["leading"].endswith("paragraph two. ")
  88. assert surr["trailing"].startswith(" paragraph three.")
  89. @pytest.mark.offline
  90. def test_equation_surrounding_protects_drawing_atom():
  91. tok = _tokenizer()
  92. block = (
  93. '<drawing id="im-prev" path="a.png" src="a" caption="Fig 1" />'
  94. " intro text. "
  95. '<equation id="eq-1" format="latex">a+b=c</equation>'
  96. " conclusion text."
  97. )
  98. span = find_target_span("equations", "eq-1", block)
  99. surr = build_surrounding(
  100. kind="equations",
  101. block_content=block,
  102. span=span,
  103. tokenizer=tok,
  104. leading_max_tokens=2000,
  105. trailing_max_tokens=2000,
  106. separators=load_chunk_separators(),
  107. )
  108. # Parser-internal id/path/src are stripped, but caption survives and
  109. # the drawing tag stays atomic (not cut in half).
  110. assert '<drawing caption="Fig 1" />' in surr["leading"]
  111. assert "/>" in surr["leading"]
  112. # No half-open drawing/equation tags
  113. assert surr["leading"].count("<drawing") == surr["leading"].count("/>")
  114. # ---------------------------------------------------------------------------
  115. # Tables surrounding: other tables must be stripped before token counting.
  116. # ---------------------------------------------------------------------------
  117. @pytest.mark.offline
  118. def test_table_surrounding_strips_other_tables_before_counting():
  119. tok = _tokenizer()
  120. block = (
  121. '<table id="tb-other" format="json">[["a","b"],["c","d"]]</table> '
  122. "narrative text describing the report. "
  123. '<table id="tb-target" format="json">[["x","y"]]</table>'
  124. " concluding remarks."
  125. )
  126. span = find_target_span("tables", "tb-target", block)
  127. surr = build_surrounding(
  128. kind="tables",
  129. block_content=block,
  130. span=span,
  131. tokenizer=tok,
  132. leading_max_tokens=2000,
  133. trailing_max_tokens=2000,
  134. separators=load_chunk_separators(),
  135. )
  136. # Sibling table must NOT appear in surrounding.
  137. assert "<table" not in surr["leading"]
  138. assert "</table>" not in surr["leading"]
  139. assert "<table" not in surr["trailing"]
  140. assert "narrative text" in surr["leading"]
  141. assert "concluding remarks" in surr["trailing"]
  142. @pytest.mark.offline
  143. def test_table_surrounding_supports_cite_marker_and_strips_sibling_cites():
  144. tok = _tokenizer()
  145. block = (
  146. 'prefix <cite type="table" refid="tb-other">表0</cite> '
  147. 'narrative <cite type="table" refid="tb-target">表1</cite> suffix'
  148. )
  149. span = find_target_span("tables", "tb-target", block)
  150. surr = build_surrounding(
  151. kind="tables",
  152. block_content=block,
  153. span=span,
  154. tokenizer=tok,
  155. leading_max_tokens=2000,
  156. trailing_max_tokens=2000,
  157. separators=load_chunk_separators(),
  158. )
  159. assert "tb-other" not in surr["leading"]
  160. assert "表0" not in surr["leading"]
  161. assert "narrative " in surr["leading"]
  162. assert surr["trailing"] == " suffix"
  163. # ---------------------------------------------------------------------------
  164. # Custom CHUNK_R_SEPARATORS via env
  165. # ---------------------------------------------------------------------------
  166. @pytest.mark.offline
  167. def test_chunk_r_separators_env_drives_segment_boundary(monkeypatch):
  168. # Only the pipe character is a separator: text must split at '|'.
  169. monkeypatch.setenv("CHUNK_R_SEPARATORS", json.dumps(["|"]))
  170. seps = load_chunk_separators()
  171. assert seps == ["|"]
  172. tok = _tokenizer()
  173. # 3 segments separated by '|'; budget = 12 chars/tokens; each seg is
  174. # 10 chars including the trailing '|', so 1 whole segment fits, 2 do not.
  175. block = 'aaaaaaaaa|bbbbbbbbb|<drawing id="d" />|ccccccccc|ddddddddd'
  176. span = find_target_span("drawings", "d", block)
  177. surr = build_surrounding(
  178. kind="drawings",
  179. block_content=block,
  180. span=span,
  181. tokenizer=tok,
  182. leading_max_tokens=12,
  183. trailing_max_tokens=12,
  184. separators=seps,
  185. )
  186. # Leading should end at a '|' boundary (one whole segment), not be
  187. # char-truncated.
  188. assert surr["leading"].endswith("|")
  189. # And contain whole segment closest to target.
  190. assert "bbbbbbbbb|" in surr["leading"]
  191. # ---------------------------------------------------------------------------
  192. # Char fallback when the closest segment alone exceeds the budget.
  193. # ---------------------------------------------------------------------------
  194. @pytest.mark.offline
  195. def test_oversized_closest_segment_char_truncated():
  196. tok = _tokenizer()
  197. # Single huge "segment" (no separator) right before the target.
  198. big = "X" * 5000
  199. block = big + '<drawing id="d" />'
  200. span = find_target_span("drawings", "d", block)
  201. surr = build_surrounding(
  202. kind="drawings",
  203. block_content=block,
  204. span=span,
  205. tokenizer=tok,
  206. leading_max_tokens=200,
  207. trailing_max_tokens=200,
  208. separators=load_chunk_separators(),
  209. )
  210. assert len(tok.encode(surr["leading"])) <= 200
  211. assert surr["trailing"] == ""
  212. # The suffix should be a tail of the X-run.
  213. assert surr["leading"].endswith("X")
  214. @pytest.mark.offline
  215. def test_oversized_trailing_char_truncated_at_head():
  216. tok = _tokenizer()
  217. big = "Y" * 5000
  218. block = '<drawing id="d" />' + big
  219. span = find_target_span("drawings", "d", block)
  220. surr = build_surrounding(
  221. kind="drawings",
  222. block_content=block,
  223. span=span,
  224. tokenizer=tok,
  225. leading_max_tokens=200,
  226. trailing_max_tokens=200,
  227. separators=load_chunk_separators(),
  228. )
  229. assert len(tok.encode(surr["trailing"])) <= 200
  230. assert surr["trailing"].startswith("Y")
  231. # ---------------------------------------------------------------------------
  232. # Drawings/equations surrounding: JSON / HTML table row trimming.
  233. # ---------------------------------------------------------------------------
  234. @pytest.mark.offline
  235. def test_drawing_surrounding_row_trims_oversized_json_table():
  236. tok = _tokenizer()
  237. # 10 rows of repeating cells; whole table is ~> budget.
  238. rows = [[f"r{i}c0", f"r{i}c1"] for i in range(10)]
  239. big_table = '<table id="tb-big" format="json">' + json.dumps(rows) + "</table>"
  240. block = big_table + ' <drawing id="d" />'
  241. span = find_target_span("drawings", "d", block)
  242. # Budget chosen so only a few rows of the JSON table fit.
  243. surr = build_surrounding(
  244. kind="drawings",
  245. block_content=block,
  246. span=span,
  247. tokenizer=tok,
  248. leading_max_tokens=80,
  249. trailing_max_tokens=80,
  250. separators=load_chunk_separators(),
  251. )
  252. # Result must be a complete (smaller) <table>...</table>, contain
  253. # closing tag, and fit within budget.
  254. leading = surr["leading"]
  255. assert "<table " in leading
  256. assert (
  257. leading.rstrip().endswith("</table>")
  258. or leading.rstrip().endswith("</table> ")
  259. or "</table>" in leading
  260. )
  261. assert len(tok.encode(leading)) <= 80
  262. # Should keep tail rows (closest to target — last rows by index)
  263. assert "r9c0" in leading
  264. # Should not include rows from the far side.
  265. assert "r0c0" not in leading
  266. @pytest.mark.offline
  267. def test_drawing_surrounding_row_trims_oversized_html_table():
  268. tok = _tokenizer()
  269. rows_html = "".join(f"<tr><td>r{i}c0</td><td>r{i}c1</td></tr>" for i in range(10))
  270. body = f"<tbody>{rows_html}</tbody>"
  271. big_table = f'<table id="tb-h" format="html">{body}</table>'
  272. block = f'<drawing id="d" /> {big_table}'
  273. span = find_target_span("drawings", "d", block)
  274. surr = build_surrounding(
  275. kind="drawings",
  276. block_content=block,
  277. span=span,
  278. tokenizer=tok,
  279. leading_max_tokens=120,
  280. trailing_max_tokens=120,
  281. separators=load_chunk_separators(),
  282. )
  283. trailing = surr["trailing"]
  284. assert "<table " in trailing
  285. assert "</table>" in trailing
  286. assert "<tbody>" in trailing
  287. assert "</tbody>" in trailing
  288. assert len(tok.encode(trailing)) <= 120
  289. # For trailing we keep head rows.
  290. assert "r0c0" in trailing
  291. assert "r9c0" not in trailing
  292. @pytest.mark.offline
  293. def test_drawing_surrounding_char_trims_oversized_single_json_row():
  294. tok = _tokenizer()
  295. row_text = "A" * 200 + "TAIL"
  296. big_table = (
  297. '<table id="tb-big" format="json">'
  298. + json.dumps([[row_text]], ensure_ascii=False)
  299. + "</table>"
  300. )
  301. block = big_table + '<drawing id="d" />'
  302. span = find_target_span("drawings", "d", block)
  303. surr = build_surrounding(
  304. kind="drawings",
  305. block_content=block,
  306. span=span,
  307. tokenizer=tok,
  308. leading_max_tokens=90,
  309. trailing_max_tokens=90,
  310. separators=load_chunk_separators(),
  311. )
  312. leading = surr["leading"]
  313. assert leading.startswith("<table ")
  314. assert leading.endswith("</table>")
  315. assert "TAIL" in leading
  316. assert len(tok.encode(leading)) <= 90
  317. body = leading[leading.index(">") + 1 : -len("</table>")]
  318. parsed = json.loads(body)
  319. assert isinstance(parsed, list)
  320. @pytest.mark.offline
  321. def test_drawing_surrounding_char_trims_oversized_single_html_row():
  322. tok = _tokenizer()
  323. row_text = "HEAD" + "B" * 200
  324. big_table = (
  325. '<table id="tb-h" format="html">'
  326. f"<tbody><tr><td>{row_text}</td></tr></tbody>"
  327. "</table>"
  328. )
  329. block = f'<drawing id="d" />{big_table}'
  330. span = find_target_span("drawings", "d", block)
  331. surr = build_surrounding(
  332. kind="drawings",
  333. block_content=block,
  334. span=span,
  335. tokenizer=tok,
  336. leading_max_tokens=100,
  337. trailing_max_tokens=100,
  338. separators=load_chunk_separators(),
  339. )
  340. trailing = surr["trailing"]
  341. assert trailing.startswith("<table ")
  342. assert trailing.endswith("</table>")
  343. assert "<tr><td>" in trailing
  344. assert "HEAD" in trailing
  345. assert len(tok.encode(trailing)) <= 100
  346. # ---------------------------------------------------------------------------
  347. # enrich_sidecars_with_surrounding: idempotency + modality gating.
  348. # ---------------------------------------------------------------------------
  349. def _write_blocks(tmp_path, base, blocks):
  350. blocks_path = tmp_path / f"{base}.blocks.jsonl"
  351. lines = [json.dumps({"type": "meta", "format": "lightrag"})]
  352. for b in blocks:
  353. lines.append(json.dumps(b, ensure_ascii=False))
  354. blocks_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
  355. return blocks_path
  356. def _write_sidecar(path, root_key, items):
  357. path.write_text(
  358. json.dumps(
  359. {"version": "1.0", root_key: items},
  360. ensure_ascii=False,
  361. indent=2,
  362. ),
  363. encoding="utf-8",
  364. )
  365. @pytest.mark.offline
  366. def test_enrich_only_updates_enabled_modalities(tmp_path):
  367. base = "doc"
  368. blockid = "b1"
  369. content = (
  370. "intro. "
  371. '<drawing id="im-1" path="a.png" src="a" />'
  372. " middle "
  373. '<table id="tb-1" format="json">[["a"]]</table>'
  374. " tail "
  375. '<equation id="eq-1" format="latex">e</equation>'
  376. " end."
  377. )
  378. _write_blocks(
  379. tmp_path,
  380. base,
  381. [
  382. {
  383. "type": "content",
  384. "blockid": blockid,
  385. "format": "plain_text",
  386. "content": content,
  387. "heading": "h",
  388. "parent_headings": [],
  389. "level": 1,
  390. }
  391. ],
  392. )
  393. drawings_path = tmp_path / f"{base}.drawings.json"
  394. tables_path = tmp_path / f"{base}.tables.json"
  395. equations_path = tmp_path / f"{base}.equations.json"
  396. _write_sidecar(
  397. drawings_path,
  398. "drawings",
  399. {"im-1": {"id": "im-1", "blockid": blockid, "heading": "h"}},
  400. )
  401. _write_sidecar(
  402. tables_path,
  403. "tables",
  404. {"tb-1": {"id": "tb-1", "blockid": blockid, "heading": "h"}},
  405. )
  406. _write_sidecar(
  407. equations_path,
  408. "equations",
  409. {"eq-1": {"id": "eq-1", "blockid": blockid, "heading": "h"}},
  410. )
  411. counts = enrich_sidecars_with_surrounding(
  412. blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
  413. enabled_modalities={"drawings"},
  414. tokenizer=_tokenizer(),
  415. leading_max_tokens=2000,
  416. trailing_max_tokens=2000,
  417. )
  418. assert counts["drawings"] == 1
  419. assert counts["tables"] == 0
  420. assert counts["equations"] == 0
  421. drawings = json.loads(drawings_path.read_text(encoding="utf-8"))
  422. tables = json.loads(tables_path.read_text(encoding="utf-8"))
  423. equations = json.loads(equations_path.read_text(encoding="utf-8"))
  424. assert "surrounding" in drawings["drawings"]["im-1"]
  425. assert drawings["drawings"]["im-1"]["surrounding"]["leading"].startswith("intro.")
  426. assert "surrounding" not in tables["tables"]["tb-1"]
  427. assert "surrounding" not in equations["equations"]["eq-1"]
  428. @pytest.mark.offline
  429. def test_enrich_runs_even_when_llm_analyze_result_present(tmp_path):
  430. """Idempotency: existing ``llm_analyze_result`` does not block
  431. surrounding backfill — we treat the two fields as independent."""
  432. base = "doc"
  433. blockid = "b1"
  434. content = 'prefix. <drawing id="im-1" path="a.png" src="a" /> suffix.'
  435. _write_blocks(
  436. tmp_path,
  437. base,
  438. [
  439. {
  440. "type": "content",
  441. "blockid": blockid,
  442. "format": "plain_text",
  443. "content": content,
  444. "heading": "h",
  445. "parent_headings": [],
  446. "level": 1,
  447. }
  448. ],
  449. )
  450. drawings_path = tmp_path / f"{base}.drawings.json"
  451. _write_sidecar(
  452. drawings_path,
  453. "drawings",
  454. {
  455. "im-1": {
  456. "id": "im-1",
  457. "blockid": blockid,
  458. "heading": "h",
  459. "llm_analyze_result": {
  460. "name": "x",
  461. "summary": "",
  462. "detail_description": "",
  463. },
  464. }
  465. },
  466. )
  467. counts = enrich_sidecars_with_surrounding(
  468. blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
  469. enabled_modalities={"drawings"},
  470. tokenizer=_tokenizer(),
  471. leading_max_tokens=2000,
  472. trailing_max_tokens=2000,
  473. )
  474. assert counts["drawings"] == 1
  475. payload = json.loads(drawings_path.read_text(encoding="utf-8"))
  476. item = payload["drawings"]["im-1"]
  477. assert item["llm_analyze_result"]["name"] == "x" # untouched
  478. assert item["surrounding"]["leading"].startswith("prefix.")
  479. assert item["surrounding"]["trailing"].startswith(" suffix.")
  480. @pytest.mark.offline
  481. def test_enrich_does_not_cross_block_boundaries(tmp_path):
  482. base = "doc"
  483. block_a = "earlier block content."
  484. block_b = 'later block. <drawing id="im-1" path="a.png" src="a" /> tail.'
  485. _write_blocks(
  486. tmp_path,
  487. base,
  488. [
  489. {
  490. "type": "content",
  491. "blockid": "bA",
  492. "format": "plain_text",
  493. "content": block_a,
  494. "heading": "h1",
  495. "parent_headings": [],
  496. "level": 1,
  497. },
  498. {
  499. "type": "content",
  500. "blockid": "bB",
  501. "format": "plain_text",
  502. "content": block_b,
  503. "heading": "h2",
  504. "parent_headings": [],
  505. "level": 1,
  506. },
  507. ],
  508. )
  509. drawings_path = tmp_path / f"{base}.drawings.json"
  510. _write_sidecar(
  511. drawings_path,
  512. "drawings",
  513. {"im-1": {"id": "im-1", "blockid": "bB", "heading": "h2"}},
  514. )
  515. enrich_sidecars_with_surrounding(
  516. blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
  517. enabled_modalities={"drawings"},
  518. tokenizer=_tokenizer(),
  519. leading_max_tokens=2000,
  520. trailing_max_tokens=2000,
  521. )
  522. payload = json.loads(drawings_path.read_text(encoding="utf-8"))
  523. surr = payload["drawings"]["im-1"]["surrounding"]
  524. # Must come from block B only — content of block A absent.
  525. assert "earlier block content" not in surr["leading"]
  526. assert surr["leading"].startswith("later block.")
  527. # ---------------------------------------------------------------------------
  528. # Per-half token budgets via SURROUNDING_LEADING/TRAILING_MAX_TOKENS env vars.
  529. # ---------------------------------------------------------------------------
  530. @pytest.mark.offline
  531. def test_env_var_leading_and_trailing_budgets_apply_independently(
  532. tmp_path, monkeypatch
  533. ):
  534. # Asymmetric budgets must produce asymmetric leading / trailing sizes.
  535. monkeypatch.setenv("SURROUNDING_LEADING_MAX_TOKENS", "5")
  536. monkeypatch.setenv("SURROUNDING_TRAILING_MAX_TOKENS", "20")
  537. base = "doc"
  538. blockid = "b1"
  539. content = "X" * 200 + '<drawing id="im-1" path="a.png" src="a" />' + "Y" * 200
  540. _write_blocks(
  541. tmp_path,
  542. base,
  543. [
  544. {
  545. "type": "content",
  546. "blockid": blockid,
  547. "format": "plain_text",
  548. "content": content,
  549. "heading": "h",
  550. "parent_headings": [],
  551. "level": 1,
  552. }
  553. ],
  554. )
  555. drawings_path = tmp_path / f"{base}.drawings.json"
  556. _write_sidecar(
  557. drawings_path,
  558. "drawings",
  559. {"im-1": {"id": "im-1", "blockid": blockid, "heading": "h"}},
  560. )
  561. tok = _tokenizer()
  562. enrich_sidecars_with_surrounding(
  563. blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
  564. enabled_modalities={"drawings"},
  565. tokenizer=tok,
  566. )
  567. surr = json.loads(drawings_path.read_text(encoding="utf-8"))["drawings"]["im-1"][
  568. "surrounding"
  569. ]
  570. assert len(tok.encode(surr["leading"])) <= 5
  571. assert len(tok.encode(surr["trailing"])) <= 20
  572. # Trailing is allowed to use its larger budget, so it must be strictly
  573. # longer than leading here.
  574. assert len(surr["trailing"]) > len(surr["leading"])
  575. # ---------------------------------------------------------------------------
  576. # Parser-internal markup stripping inside surrounding (mirrors what
  577. # ``strip_internal_multimodal_markup_for_extraction`` does for chunk
  578. # content before entity extraction). The cleaning happens *before*
  579. # token-budgeted truncation, so the saved budget reflects what the
  580. # LLM actually receives and a truncation point can never land inside
  581. # an unprocessed ``id="…"`` attribute.
  582. # ---------------------------------------------------------------------------
  583. @pytest.mark.offline
  584. def test_surrounding_strips_drawing_id_path_src():
  585. tok = _tokenizer()
  586. block = (
  587. "leading prose. "
  588. '<drawing id="im-x" path="figs/a.png" src="raw/a.png" caption="Fig 1" />'
  589. " between. "
  590. '<equation id="eq-target" format="latex">x=1</equation>'
  591. " trailing prose."
  592. )
  593. span = find_target_span("equations", "eq-target", block)
  594. surr = build_surrounding(
  595. kind="equations",
  596. block_content=block,
  597. span=span,
  598. tokenizer=tok,
  599. leading_max_tokens=2000,
  600. trailing_max_tokens=2000,
  601. separators=load_chunk_separators(),
  602. )
  603. leading = surr["leading"]
  604. assert '<drawing caption="Fig 1" />' in leading
  605. assert 'id="im-x"' not in leading
  606. assert "path=" not in leading
  607. assert "src=" not in leading
  608. @pytest.mark.offline
  609. def test_surrounding_strips_table_internal_id():
  610. tok = _tokenizer()
  611. block = (
  612. "prefix. "
  613. '<table id="tb-x" format="json" caption="Sales">[[1,2],[3,4]]</table>'
  614. " between. "
  615. '<drawing id="im-target" caption="Fig 2" />'
  616. " suffix."
  617. )
  618. span = find_target_span("drawings", "im-target", block)
  619. surr = build_surrounding(
  620. kind="drawings",
  621. block_content=block,
  622. span=span,
  623. tokenizer=tok,
  624. leading_max_tokens=2000,
  625. trailing_max_tokens=2000,
  626. separators=load_chunk_separators(),
  627. )
  628. leading = surr["leading"]
  629. assert '<table format="json" caption="Sales">[[1,2],[3,4]]</table>' in leading
  630. assert 'id="tb-x"' not in leading
  631. @pytest.mark.offline
  632. def test_surrounding_strips_cite_refid_keeping_visible_text():
  633. tok = _tokenizer()
  634. block = (
  635. "see "
  636. '<cite type="table" refid="tb-x">Table 1</cite>'
  637. " for details. "
  638. '<drawing id="im-target" caption="Fig 3" />'
  639. " end."
  640. )
  641. span = find_target_span("drawings", "im-target", block)
  642. surr = build_surrounding(
  643. kind="drawings",
  644. block_content=block,
  645. span=span,
  646. tokenizer=tok,
  647. leading_max_tokens=2000,
  648. trailing_max_tokens=2000,
  649. separators=load_chunk_separators(),
  650. )
  651. leading = surr["leading"]
  652. # Surrounding path uses keep_cite_tag=True: the cite wrapper survives
  653. # (so the VLM/LLM can tell "Table 1" is a reference to an external
  654. # table, not inline prose) but the parser-internal refid is gone.
  655. assert '<cite type="table">Table 1</cite>' in leading
  656. assert "refid=" not in leading
  657. assert "tb-x" not in leading
  658. @pytest.mark.offline
  659. def test_surrounding_keeps_equation_cite_tag_and_strips_refid():
  660. """In production, equations without LaTeX content emit as
  661. ``<cite type="equation" refid="eq-…">公式 N</cite>`` rather than a
  662. full ``<equation>`` tag. Surrounding must keep the wrapper so the
  663. multimodal analyzer can recognize the visible label as an external
  664. referent, not inline prose."""
  665. tok = _tokenizer()
  666. block = (
  667. "see "
  668. '<cite type="equation" refid="eq-y">公式 2</cite>'
  669. " above. "
  670. '<drawing id="im-target" caption="Fig 4" />'
  671. " end."
  672. )
  673. span = find_target_span("drawings", "im-target", block)
  674. surr = build_surrounding(
  675. kind="drawings",
  676. block_content=block,
  677. span=span,
  678. tokenizer=tok,
  679. leading_max_tokens=2000,
  680. trailing_max_tokens=2000,
  681. separators=load_chunk_separators(),
  682. )
  683. leading = surr["leading"]
  684. assert '<cite type="equation">公式 2</cite>' in leading
  685. assert "refid=" not in leading
  686. assert "eq-y" not in leading
  687. @pytest.mark.offline
  688. def test_strip_happens_before_budget_truncation():
  689. """Regression guard for the strip-before-truncate ordering.
  690. Constructs a leading source whose raw form (with id/path/src) exceeds
  691. the budget while its stripped form fits. If strip ran *after*
  692. truncation, the budget would be measured against the bloated raw
  693. string and the saved surrounding would be cut early (possibly mid-
  694. attribute, leaving ``id="…`` residue).
  695. """
  696. tok = _tokenizer()
  697. # Raw drawing tag including attrs (~67 chars), stripped form is
  698. # just '<drawing caption="C" />' (~24 chars). Budget at 30 sits
  699. # between the two — raw is too big, stripped fits.
  700. block = (
  701. '<drawing id="im-prev" path="some/long/path.png" src="raw/long/path.png"'
  702. ' caption="C" />'
  703. '<equation id="eq-1" format="latex">y</equation>'
  704. " tail."
  705. )
  706. span = find_target_span("equations", "eq-1", block)
  707. surr = build_surrounding(
  708. kind="equations",
  709. block_content=block,
  710. span=span,
  711. tokenizer=tok,
  712. leading_max_tokens=30,
  713. trailing_max_tokens=2000,
  714. separators=load_chunk_separators(),
  715. )
  716. leading = surr["leading"]
  717. # Whole stripped tag must be present — proves strip ran before
  718. # the budget gate.
  719. assert leading == '<drawing caption="C" />'
  720. # And no parser-internal markers leaked through.
  721. assert "id=" not in leading
  722. assert "path=" not in leading
  723. assert "src=" not in leading
  724. @pytest.mark.offline
  725. def test_enrich_overwrites_surrounding_when_budget_changes(tmp_path):
  726. """Idempotency: rerunning with a smaller budget overwrites the prior
  727. surrounding, demonstrating that ``SURROUNDING_LEADING_MAX_TOKENS``
  728. changes propagate without needing to clear sidecars first."""
  729. base = "doc"
  730. blockid = "b1"
  731. content = "L" * 500 + '<drawing id="im-1" caption="C" />' + "T" * 500
  732. _write_blocks(
  733. tmp_path,
  734. base,
  735. [
  736. {
  737. "type": "content",
  738. "blockid": blockid,
  739. "format": "plain_text",
  740. "content": content,
  741. "heading": "h",
  742. "parent_headings": [],
  743. "level": 1,
  744. }
  745. ],
  746. )
  747. drawings_path = tmp_path / f"{base}.drawings.json"
  748. _write_sidecar(
  749. drawings_path,
  750. "drawings",
  751. {"im-1": {"id": "im-1", "blockid": blockid, "heading": "h"}},
  752. )
  753. tok = _tokenizer()
  754. enrich_sidecars_with_surrounding(
  755. blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
  756. enabled_modalities={"drawings"},
  757. tokenizer=tok,
  758. leading_max_tokens=300,
  759. trailing_max_tokens=300,
  760. )
  761. first = json.loads(drawings_path.read_text(encoding="utf-8"))["drawings"]["im-1"][
  762. "surrounding"
  763. ]
  764. first_leading_len = len(first["leading"])
  765. first_trailing_len = len(first["trailing"])
  766. enrich_sidecars_with_surrounding(
  767. blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
  768. enabled_modalities={"drawings"},
  769. tokenizer=tok,
  770. leading_max_tokens=50,
  771. trailing_max_tokens=50,
  772. )
  773. second = json.loads(drawings_path.read_text(encoding="utf-8"))["drawings"]["im-1"][
  774. "surrounding"
  775. ]
  776. # New budget is smaller, so saved surrounding must shrink — proving
  777. # the previous value was overwritten, not preserved.
  778. assert len(second["leading"]) < first_leading_len
  779. assert len(second["trailing"]) < first_trailing_len
  780. assert len(tok.encode(second["leading"])) <= 50
  781. assert len(tok.encode(second["trailing"])) <= 50
  782. @pytest.mark.offline
  783. def test_env_var_invalid_value_falls_back_to_default(monkeypatch):
  784. # An unparseable env value must not crash; it falls back to 2000.
  785. monkeypatch.setenv("SURROUNDING_LEADING_MAX_TOKENS", "not-a-number")
  786. monkeypatch.setenv("SURROUNDING_TRAILING_MAX_TOKENS", "not-a-number")
  787. from lightrag.multimodal_context import (
  788. DEFAULT_SURROUNDING_MAX_TOKENS,
  789. _resolve_surrounding_budget,
  790. )
  791. leading, trailing = _resolve_surrounding_budget(None, None)
  792. assert leading == DEFAULT_SURROUNDING_MAX_TOKENS
  793. assert trailing == DEFAULT_SURROUNDING_MAX_TOKENS