| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893 |
"""Unit tests for the native multimodal surrounding-context extractor.

See ``docs/NativeMultimodalSurroundingContextPlan-zh.md``.

These tests use a 1:1 character-token mapping so the expected token
budgets in each scenario stay obvious without coupling to tiktoken's
BPE. The helper functions exercised here are pure (no async, no
network), so the suite runs offline.
"""
- import json
- import pytest
- from lightrag.multimodal_context import (
- build_surrounding,
- enrich_sidecars_with_surrounding,
- find_target_span,
- load_chunk_separators,
- )
- from lightrag.utils import Tokenizer, TokenizerInterface
class _CharTokenizer(TokenizerInterface):
    """Tokenizer stub that maps every character to exactly one token.

    Each token is the character's Unicode code point, so the number of
    tokens always equals the number of characters.
    """

    def encode(self, content: str):
        """Return the code point of each character in *content*."""
        return list(map(ord, content))

    def decode(self, tokens):
        """Rebuild the text from a sequence of code points."""
        return "".join(map(chr, tokens))
def _tokenizer() -> Tokenizer:
    """Build a ``Tokenizer`` backed by the 1:1 character tokenizer."""
    char_backend = _CharTokenizer()
    return Tokenizer(model_name="char", tokenizer=char_backend)
- # ---------------------------------------------------------------------------
- # Target-tag locator
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_find_target_span_drawing_in_mixed_content():
    """A self-closing ``<drawing>`` tag embedded in prose is located by id."""
    drawing_id = "im-abcd-0001"
    content = (
        "leading text. "
        '<drawing id="im-abcd-0001" format="png" path="img.png" src="img" /> '
        "trailing text."
    )
    located = find_target_span("drawings", drawing_id, content)
    assert located is not None
    begin, stop = located
    matched = content[begin:stop]
    # The span must run from the opening of the tag to its "/>" terminator.
    assert matched.startswith('<drawing id="im-abcd-0001"')
    assert matched.endswith("/>")
@pytest.mark.offline
def test_find_target_span_table_with_id_anywhere_in_attrs():
    """A table is found even when ``id`` is not its first attribute."""
    table_id = "tb-abcd-0007"
    content = (
        'before <table format="json" id="tb-abcd-0007">[[1,2],[3,4]]</table> after'
    )
    located = find_target_span("tables", table_id, content)
    assert located is not None
    begin, stop = located[0], located[1]
    matched = content[begin:stop]
    assert matched.endswith("</table>")
    assert 'id="tb-abcd-0007"' in matched
@pytest.mark.offline
def test_find_target_span_table_cite_marker():
    """A ``<cite type="table">`` marker is accepted as the table's location."""
    content = 'before <cite type="table" refid="tb-abcd-0007">表1</cite> after'
    located = find_target_span("tables", "tb-abcd-0007", content)
    assert located is not None
    begin, stop = located[0], located[1]
    assert content[begin:stop].startswith("<cite")
@pytest.mark.offline
def test_find_target_span_equation():
    """An ``<equation>`` element is located through to its closing tag."""
    equation_id = "eq-abcd-0002"
    content = 'A <equation id="eq-abcd-0002" format="latex">x^2</equation> B'
    located = find_target_span("equations", equation_id, content)
    assert located is not None
    begin, stop = located[0], located[1]
    assert content[begin:stop].endswith("</equation>")
@pytest.mark.offline
def test_find_target_span_unknown_id_returns_none():
    """Looking up an id that is absent from the content yields ``None``."""
    content = '<drawing id="im-1" />'
    located = find_target_span("drawings", "im-other", content)
    assert located is None
- # ---------------------------------------------------------------------------
- # Drawings & equations surrounding
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_drawing_surrounding_kept_within_block_only():
    """Leading and trailing context hug the drawing tag on either side."""
    tokenizer = _tokenizer()
    block = (
        "paragraph one ends. paragraph two. "
        '<drawing id="im-1" path="a.png" src="a" /> '
        "paragraph three. paragraph four."
    )
    target_span = find_target_span("drawings", "im-1", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    assert leading.endswith("paragraph two. ")
    trailing = surrounding["trailing"]
    assert trailing.startswith(" paragraph three.")
@pytest.mark.offline
def test_equation_surrounding_protects_drawing_atom():
    """A drawing tag preceding the target equation is kept whole in ``leading``."""
    tokenizer = _tokenizer()
    block = (
        '<drawing id="im-prev" path="a.png" src="a" caption="Fig 1" />'
        " intro text. "
        '<equation id="eq-1" format="latex">a+b=c</equation>'
        " conclusion text."
    )
    target_span = find_target_span("equations", "eq-1", block)
    surrounding = build_surrounding(
        kind="equations",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    # id/path/src are expected to be gone while the caption stays, and the
    # tag must remain atomic rather than being cut in half.
    assert '<drawing caption="Fig 1" />' in leading
    assert "/>" in leading
    # Every opened drawing tag has a matching terminator (no half-open tags).
    opened = leading.count("<drawing")
    closed = leading.count("/>")
    assert opened == closed
- # ---------------------------------------------------------------------------
- # Tables surrounding: other tables must be stripped before token counting.
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_table_surrounding_strips_other_tables_before_counting():
    """Sibling tables are absent from a table's surrounding; prose remains."""
    tokenizer = _tokenizer()
    block = (
        '<table id="tb-other" format="json">[["a","b"],["c","d"]]</table> '
        "narrative text describing the report. "
        '<table id="tb-target" format="json">[["x","y"]]</table>'
        " concluding remarks."
    )
    target_span = find_target_span("tables", "tb-target", block)
    surrounding = build_surrounding(
        kind="tables",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    trailing = surrounding["trailing"]
    # The sibling table must not leak into either half.
    assert "<table" not in leading
    assert "</table>" not in leading
    assert "<table" not in trailing
    # Plain prose on both sides is preserved.
    assert "narrative text" in leading
    assert "concluding remarks" in trailing
@pytest.mark.offline
def test_table_surrounding_supports_cite_marker_and_strips_sibling_cites():
    """A cite marker can be the target; other table cites are removed."""
    tokenizer = _tokenizer()
    block = (
        'prefix <cite type="table" refid="tb-other">表0</cite> '
        'narrative <cite type="table" refid="tb-target">表1</cite> suffix'
    )
    target_span = find_target_span("tables", "tb-target", block)
    surrounding = build_surrounding(
        kind="tables",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    # Neither the sibling cite's refid nor its visible label may survive.
    assert "tb-other" not in leading
    assert "表0" not in leading
    assert "narrative " in leading
    assert surrounding["trailing"] == " suffix"
- # ---------------------------------------------------------------------------
- # Custom CHUNK_R_SEPARATORS via env
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_chunk_r_separators_env_drives_segment_boundary(monkeypatch):
    """``CHUNK_R_SEPARATORS`` from the environment decides where text splits."""
    # Make the pipe character the only separator, so text must split at '|'.
    monkeypatch.setenv("CHUNK_R_SEPARATORS", json.dumps(["|"]))
    separators = load_chunk_separators()
    assert separators == ["|"]

    tokenizer = _tokenizer()
    # Each segment is 10 chars including its trailing '|'. With a budget of
    # 12 chars/tokens one whole segment fits, two do not.
    block = 'aaaaaaaaa|bbbbbbbbb|<drawing id="d" />|ccccccccc|ddddddddd'
    target_span = find_target_span("drawings", "d", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=12,
        trailing_max_tokens=12,
        separators=separators,
    )
    leading = surrounding["leading"]
    # Leading ends on a '|' boundary (a whole segment), not a char-level cut.
    assert leading.endswith("|")
    # The segment closest to the target is present in full.
    assert "bbbbbbbbb|" in leading
- # ---------------------------------------------------------------------------
- # Char fallback when the closest segment alone exceeds the budget.
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_oversized_closest_segment_char_truncated():
    """A separator-free run before the target is cut to fit the leading budget."""
    tokenizer = _tokenizer()
    # One huge "segment" with no separator sits right before the target.
    filler = "X" * 5000
    block = filler + '<drawing id="d" />'
    target_span = find_target_span("drawings", "d", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=200,
        trailing_max_tokens=200,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    leading_tokens = tokenizer.encode(leading)
    assert len(leading_tokens) <= 200
    assert surrounding["trailing"] == ""
    # What remains should be a tail of the X-run.
    assert leading.endswith("X")
@pytest.mark.offline
def test_oversized_trailing_char_truncated_at_head():
    """A separator-free run after the target is cut to fit the trailing budget."""
    tokenizer = _tokenizer()
    filler = "Y" * 5000
    block = '<drawing id="d" />' + filler
    target_span = find_target_span("drawings", "d", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=200,
        trailing_max_tokens=200,
        separators=load_chunk_separators(),
    )
    trailing = surrounding["trailing"]
    trailing_tokens = tokenizer.encode(trailing)
    assert len(trailing_tokens) <= 200
    assert trailing.startswith("Y")
- # ---------------------------------------------------------------------------
- # Drawings/equations surrounding: JSON / HTML table row trimming.
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_drawing_surrounding_row_trims_oversized_json_table():
    """An oversized JSON table before a drawing is trimmed to its last rows."""
    tok = _tokenizer()
    # Ten two-cell rows; the serialized table is larger than the 80-token
    # budget used below.
    rows = [[f"r{i}c0", f"r{i}c1"] for i in range(10)]
    big_table = '<table id="tb-big" format="json">' + json.dumps(rows) + "</table>"
    block = big_table + ' <drawing id="d" />'
    span = find_target_span("drawings", "d", block)
    # Budget chosen so only a few rows of the JSON table fit.
    surr = build_surrounding(
        kind="drawings",
        block_content=block,
        span=span,
        tokenizer=tok,
        leading_max_tokens=80,
        trailing_max_tokens=80,
        separators=load_chunk_separators(),
    )
    leading = surr["leading"]
    # The trimmed result must still be a <table> carrying its closing tag
    # and must fit within the budget.
    assert "<table " in leading
    assert "</table>" in leading
    assert len(tok.encode(leading)) <= 80
    # Rows closest to the target (the last rows by index) are kept ...
    assert "r9c0" in leading
    # ... while rows from the far side are dropped.
    assert "r0c0" not in leading
@pytest.mark.offline
def test_drawing_surrounding_row_trims_oversized_html_table():
    """An oversized HTML table after a drawing is trimmed to its first rows."""
    tokenizer = _tokenizer()
    row_markup = [f"<tr><td>r{i}c0</td><td>r{i}c1</td></tr>" for i in range(10)]
    table_body = "<tbody>" + "".join(row_markup) + "</tbody>"
    table_markup = '<table id="tb-h" format="html">' + table_body + "</table>"
    block = '<drawing id="d" /> ' + table_markup
    target_span = find_target_span("drawings", "d", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=120,
        trailing_max_tokens=120,
        separators=load_chunk_separators(),
    )
    trailing = surrounding["trailing"]
    # The table wrapper and its tbody must both stay balanced.
    assert "<table " in trailing
    assert "</table>" in trailing
    assert "<tbody>" in trailing
    assert "</tbody>" in trailing
    assert len(tokenizer.encode(trailing)) <= 120
    # On the trailing side the head rows are the ones kept.
    assert "r0c0" in trailing
    assert "r9c0" not in trailing
@pytest.mark.offline
def test_drawing_surrounding_char_trims_oversized_single_json_row():
    """A single oversized JSON row is shortened yet the table stays valid JSON."""
    tokenizer = _tokenizer()
    closing_tag = "</table>"
    cell_text = "A" * 200 + "TAIL"
    serialized_rows = json.dumps([[cell_text]], ensure_ascii=False)
    table_markup = '<table id="tb-big" format="json">' + serialized_rows + closing_tag
    block = table_markup + '<drawing id="d" />'
    target_span = find_target_span("drawings", "d", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=90,
        trailing_max_tokens=90,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    assert leading.startswith("<table ")
    assert leading.endswith(closing_tag)
    # The end of the cell (nearest the target) must survive the cut.
    assert "TAIL" in leading
    assert len(tokenizer.encode(leading)) <= 90
    # Whatever sits between the opening tag and the closing tag must parse.
    payload_start = leading.index(">") + 1
    payload = leading[payload_start : -len(closing_tag)]
    decoded = json.loads(payload)
    assert isinstance(decoded, list)
@pytest.mark.offline
def test_drawing_surrounding_char_trims_oversized_single_html_row():
    """A single oversized HTML row is shortened yet the table stays wrapped."""
    tokenizer = _tokenizer()
    cell_text = "HEAD" + "B" * 200
    table_markup = (
        '<table id="tb-h" format="html">'
        "<tbody><tr><td>" + cell_text + "</td></tr></tbody>"
        "</table>"
    )
    block = '<drawing id="d" />' + table_markup
    target_span = find_target_span("drawings", "d", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=100,
        trailing_max_tokens=100,
        separators=load_chunk_separators(),
    )
    trailing = surrounding["trailing"]
    assert trailing.startswith("<table ")
    assert trailing.endswith("</table>")
    assert "<tr><td>" in trailing
    # The start of the cell (nearest the target) must survive the cut.
    assert "HEAD" in trailing
    assert len(tokenizer.encode(trailing)) <= 100
- # ---------------------------------------------------------------------------
- # enrich_sidecars_with_surrounding: idempotency + modality gating.
- # ---------------------------------------------------------------------------
def _write_blocks(tmp_path, base, blocks):
    """Write ``<base>.blocks.jsonl`` under *tmp_path* and return its path.

    The file holds one JSON object per line: first a meta record
    (``{"type": "meta", "format": "lightrag"}``), then every entry of
    *blocks* in order. Non-ASCII text is written unescaped as UTF-8 and the
    file ends with a newline.

    Args:
        tmp_path: Directory (a ``pathlib.Path``) to create the file in.
        base: File-name stem placed before ``.blocks.jsonl``.
        blocks: Iterable of JSON-serializable block records.

    Returns:
        The path of the written file.
    """
    blocks_path = tmp_path / f"{base}.blocks.jsonl"
    lines = [json.dumps({"type": "meta", "format": "lightrag"})]
    lines.extend(json.dumps(block, ensure_ascii=False) for block in blocks)
    blocks_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return blocks_path
def _write_sidecar(path, root_key, items):
    """Write a sidecar JSON document to *path* and return *path*.

    The document has the shape ``{"version": "1.0", <root_key>: <items>}``
    and is written as UTF-8, indented by two spaces, with non-ASCII text
    left unescaped.

    Args:
        path: Destination file (a ``pathlib.Path``).
        root_key: Top-level key under which *items* is stored.
        items: JSON-serializable payload for *root_key*.

    Returns:
        *path*, mirroring ``_write_blocks`` so callers can chain on it.
    """
    document = {"version": "1.0", root_key: items}
    path.write_text(
        json.dumps(document, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    return path
@pytest.mark.offline
def test_enrich_only_updates_enabled_modalities(tmp_path):
    """Only sidecars of enabled modalities receive a ``surrounding`` entry."""
    base = "doc"
    blockid = "b1"
    content = (
        "intro. "
        '<drawing id="im-1" path="a.png" src="a" />'
        " middle "
        '<table id="tb-1" format="json">[["a"]]</table>'
        " tail "
        '<equation id="eq-1" format="latex">e</equation>'
        " end."
    )
    content_block = {
        "type": "content",
        "blockid": blockid,
        "format": "plain_text",
        "content": content,
        "heading": "h",
        "parent_headings": [],
        "level": 1,
    }
    _write_blocks(tmp_path, base, [content_block])

    # One sidecar per modality, each holding a single item in the same block.
    item_ids = {"drawings": "im-1", "tables": "tb-1", "equations": "eq-1"}
    sidecar_paths = {}
    for root_key, item_id in item_ids.items():
        sidecar_path = tmp_path / f"{base}.{root_key}.json"
        _write_sidecar(
            sidecar_path,
            root_key,
            {item_id: {"id": item_id, "blockid": blockid, "heading": "h"}},
        )
        sidecar_paths[root_key] = sidecar_path

    counts = enrich_sidecars_with_surrounding(
        blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
        enabled_modalities={"drawings"},
        tokenizer=_tokenizer(),
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
    )
    assert counts["drawings"] == 1
    assert counts["tables"] == 0
    assert counts["equations"] == 0

    drawings = json.loads(sidecar_paths["drawings"].read_text(encoding="utf-8"))
    tables = json.loads(sidecar_paths["tables"].read_text(encoding="utf-8"))
    equations = json.loads(sidecar_paths["equations"].read_text(encoding="utf-8"))
    drawing_item = drawings["drawings"]["im-1"]
    assert "surrounding" in drawing_item
    assert drawing_item["surrounding"]["leading"].startswith("intro.")
    # Modalities that were not enabled must be left untouched.
    assert "surrounding" not in tables["tables"]["tb-1"]
    assert "surrounding" not in equations["equations"]["eq-1"]
@pytest.mark.offline
def test_enrich_runs_even_when_llm_analyze_result_present(tmp_path):
    """Idempotency: a pre-existing ``llm_analyze_result`` neither blocks the
    surrounding backfill nor gets modified by it; the two fields are
    independent."""
    base = "doc"
    blockid = "b1"
    content = 'prefix. <drawing id="im-1" path="a.png" src="a" /> suffix.'
    content_block = {
        "type": "content",
        "blockid": blockid,
        "format": "plain_text",
        "content": content,
        "heading": "h",
        "parent_headings": [],
        "level": 1,
    }
    _write_blocks(tmp_path, base, [content_block])

    analyzed_item = {
        "id": "im-1",
        "blockid": blockid,
        "heading": "h",
        "llm_analyze_result": {
            "name": "x",
            "summary": "",
            "detail_description": "",
        },
    }
    drawings_path = tmp_path / f"{base}.drawings.json"
    _write_sidecar(drawings_path, "drawings", {"im-1": analyzed_item})

    counts = enrich_sidecars_with_surrounding(
        blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
        enabled_modalities={"drawings"},
        tokenizer=_tokenizer(),
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
    )
    assert counts["drawings"] == 1

    stored = json.loads(drawings_path.read_text(encoding="utf-8"))
    item = stored["drawings"]["im-1"]
    assert item["llm_analyze_result"]["name"] == "x"  # untouched
    surrounding = item["surrounding"]
    assert surrounding["leading"].startswith("prefix.")
    assert surrounding["trailing"].startswith(" suffix.")
@pytest.mark.offline
def test_enrich_does_not_cross_block_boundaries(tmp_path):
    """Surrounding text is drawn only from the block that holds the target."""
    base = "doc"
    block_a = "earlier block content."
    block_b = 'later block. <drawing id="im-1" path="a.png" src="a" /> tail.'
    block_specs = [("bA", block_a, "h1"), ("bB", block_b, "h2")]
    records = [
        {
            "type": "content",
            "blockid": block_id,
            "format": "plain_text",
            "content": text,
            "heading": heading,
            "parent_headings": [],
            "level": 1,
        }
        for block_id, text, heading in block_specs
    ]
    _write_blocks(tmp_path, base, records)

    drawings_path = tmp_path / f"{base}.drawings.json"
    _write_sidecar(
        drawings_path,
        "drawings",
        {"im-1": {"id": "im-1", "blockid": "bB", "heading": "h2"}},
    )
    enrich_sidecars_with_surrounding(
        blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
        enabled_modalities={"drawings"},
        tokenizer=_tokenizer(),
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
    )
    stored = json.loads(drawings_path.read_text(encoding="utf-8"))
    leading = stored["drawings"]["im-1"]["surrounding"]["leading"]
    # Everything must come from block B; block A's text must be absent.
    assert "earlier block content" not in leading
    assert leading.startswith("later block.")
- # ---------------------------------------------------------------------------
- # Per-half token budgets via SURROUNDING_LEADING/TRAILING_MAX_TOKENS env vars.
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_env_var_leading_and_trailing_budgets_apply_independently(
    tmp_path, monkeypatch
):
    """Asymmetric env budgets yield asymmetric leading / trailing sizes."""
    monkeypatch.setenv("SURROUNDING_LEADING_MAX_TOKENS", "5")
    monkeypatch.setenv("SURROUNDING_TRAILING_MAX_TOKENS", "20")
    base = "doc"
    blockid = "b1"
    content = "X" * 200 + '<drawing id="im-1" path="a.png" src="a" />' + "Y" * 200
    content_block = {
        "type": "content",
        "blockid": blockid,
        "format": "plain_text",
        "content": content,
        "heading": "h",
        "parent_headings": [],
        "level": 1,
    }
    _write_blocks(tmp_path, base, [content_block])
    drawings_path = tmp_path / f"{base}.drawings.json"
    _write_sidecar(
        drawings_path,
        "drawings",
        {"im-1": {"id": "im-1", "blockid": blockid, "heading": "h"}},
    )
    tokenizer = _tokenizer()
    # No explicit budgets are passed here, so the env vars set above apply.
    enrich_sidecars_with_surrounding(
        blocks_path=str(tmp_path / f"{base}.blocks.jsonl"),
        enabled_modalities={"drawings"},
        tokenizer=tokenizer,
    )
    stored = json.loads(drawings_path.read_text(encoding="utf-8"))
    surrounding = stored["drawings"]["im-1"]["surrounding"]
    leading = surrounding["leading"]
    trailing = surrounding["trailing"]
    assert len(tokenizer.encode(leading)) <= 5
    assert len(tokenizer.encode(trailing)) <= 20
    # Trailing may use its larger budget, so it must be strictly longer
    # than leading here.
    assert len(trailing) > len(leading)
- # ---------------------------------------------------------------------------
- # Parser-internal markup stripping inside surrounding (mirrors what
- # ``strip_internal_multimodal_markup_for_extraction`` does for chunk
- # content before entity extraction). The cleaning happens *before*
- # token-budgeted truncation, so the saved budget reflects what the
- # LLM actually receives and a truncation point can never land inside
- # an unprocessed ``id="…"`` attribute.
- # ---------------------------------------------------------------------------
@pytest.mark.offline
def test_surrounding_strips_drawing_id_path_src():
    """A neighbouring drawing keeps its caption but loses id, path and src."""
    tokenizer = _tokenizer()
    block = (
        "leading prose. "
        '<drawing id="im-x" path="figs/a.png" src="raw/a.png" caption="Fig 1" />'
        " between. "
        '<equation id="eq-target" format="latex">x=1</equation>'
        " trailing prose."
    )
    target_span = find_target_span("equations", "eq-target", block)
    surrounding = build_surrounding(
        kind="equations",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    assert '<drawing caption="Fig 1" />' in leading
    assert 'id="im-x"' not in leading
    assert "path=" not in leading
    assert "src=" not in leading
@pytest.mark.offline
def test_surrounding_strips_table_internal_id():
    """A neighbouring table keeps format, caption and body but loses its id."""
    tokenizer = _tokenizer()
    block = (
        "prefix. "
        '<table id="tb-x" format="json" caption="Sales">[[1,2],[3,4]]</table>'
        " between. "
        '<drawing id="im-target" caption="Fig 2" />'
        " suffix."
    )
    target_span = find_target_span("drawings", "im-target", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    expected_table = '<table format="json" caption="Sales">[[1,2],[3,4]]</table>'
    assert expected_table in leading
    assert 'id="tb-x"' not in leading
@pytest.mark.offline
def test_surrounding_strips_cite_refid_keeping_visible_text():
    """A table cite keeps its wrapper and label while its refid is removed."""
    tokenizer = _tokenizer()
    block = (
        "see "
        '<cite type="table" refid="tb-x">Table 1</cite>'
        " for details. "
        '<drawing id="im-target" caption="Fig 3" />'
        " end."
    )
    target_span = find_target_span("drawings", "im-target", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    # The cite wrapper is expected to survive, so a reader can tell that
    # "Table 1" refers to an external table rather than being inline
    # prose, while the parser-internal refid is dropped.
    assert '<cite type="table">Table 1</cite>' in leading
    assert "refid=" not in leading
    assert "tb-x" not in leading
@pytest.mark.offline
def test_surrounding_keeps_equation_cite_tag_and_strips_refid():
    """An equation cite keeps its wrapper and label while its refid is removed.

    Equations may appear as ``<cite type="equation" refid="eq-…">公式 N</cite>``
    instead of a full ``<equation>`` tag. The wrapper has to stay in the
    surrounding so the visible label reads as a reference to something
    external rather than as inline prose.
    """
    tokenizer = _tokenizer()
    block = (
        "see "
        '<cite type="equation" refid="eq-y">公式 2</cite>'
        " above. "
        '<drawing id="im-target" caption="Fig 4" />'
        " end."
    )
    target_span = find_target_span("drawings", "im-target", block)
    surrounding = build_surrounding(
        kind="drawings",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=2000,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    assert '<cite type="equation">公式 2</cite>' in leading
    assert "refid=" not in leading
    assert "eq-y" not in leading
@pytest.mark.offline
def test_strip_happens_before_budget_truncation():
    """Regression guard: internal attributes are stripped before budgeting.

    The leading source is built so that its raw form (with id/path/src)
    exceeds the budget while its stripped form fits. Were stripping to run
    *after* truncation, the budget would be measured against the bloated
    raw string and the saved surrounding would be cut early (possibly
    mid-attribute, leaving ``id="…`` residue).
    """
    tokenizer = _tokenizer()
    # The raw drawing tag with all attributes is longer than the 30-token
    # budget; the stripped '<drawing caption="C" />' is shorter than it.
    block = (
        '<drawing id="im-prev" path="some/long/path.png" src="raw/long/path.png"'
        ' caption="C" />'
        '<equation id="eq-1" format="latex">y</equation>'
        " tail."
    )
    target_span = find_target_span("equations", "eq-1", block)
    surrounding = build_surrounding(
        kind="equations",
        block_content=block,
        span=target_span,
        tokenizer=tokenizer,
        leading_max_tokens=30,
        trailing_max_tokens=2000,
        separators=load_chunk_separators(),
    )
    leading = surrounding["leading"]
    # The whole stripped tag being present shows the strip ran before the
    # budget gate.
    assert leading == '<drawing caption="C" />'
    # No parser-internal markers leaked through.
    assert "id=" not in leading
    assert "path=" not in leading
    assert "src=" not in leading
@pytest.mark.offline
def test_enrich_overwrites_surrounding_when_budget_changes(tmp_path):
    """Idempotency: a rerun with a smaller budget replaces the earlier
    surrounding, so budget changes take effect without clearing the
    sidecars first."""
    base = "doc"
    blockid = "b1"
    content = "L" * 500 + '<drawing id="im-1" caption="C" />' + "T" * 500
    content_block = {
        "type": "content",
        "blockid": blockid,
        "format": "plain_text",
        "content": content,
        "heading": "h",
        "parent_headings": [],
        "level": 1,
    }
    _write_blocks(tmp_path, base, [content_block])
    drawings_path = tmp_path / f"{base}.drawings.json"
    _write_sidecar(
        drawings_path,
        "drawings",
        {"im-1": {"id": "im-1", "blockid": blockid, "heading": "h"}},
    )
    tokenizer = _tokenizer()
    blocks_file = str(tmp_path / f"{base}.blocks.jsonl")

    # First pass with the larger budget.
    enrich_sidecars_with_surrounding(
        blocks_path=blocks_file,
        enabled_modalities={"drawings"},
        tokenizer=tokenizer,
        leading_max_tokens=300,
        trailing_max_tokens=300,
    )
    after_first = json.loads(drawings_path.read_text(encoding="utf-8"))
    first = after_first["drawings"]["im-1"]["surrounding"]
    first_leading_len = len(first["leading"])
    first_trailing_len = len(first["trailing"])

    # Second pass over the same sidecar with the smaller budget.
    enrich_sidecars_with_surrounding(
        blocks_path=blocks_file,
        enabled_modalities={"drawings"},
        tokenizer=tokenizer,
        leading_max_tokens=50,
        trailing_max_tokens=50,
    )
    after_second = json.loads(drawings_path.read_text(encoding="utf-8"))
    second = after_second["drawings"]["im-1"]["surrounding"]
    # A shrunken surrounding shows the earlier value was overwritten, not
    # preserved.
    assert len(second["leading"]) < first_leading_len
    assert len(second["trailing"]) < first_trailing_len
    assert len(tokenizer.encode(second["leading"])) <= 50
    assert len(tokenizer.encode(second["trailing"])) <= 50
@pytest.mark.offline
def test_env_var_invalid_value_falls_back_to_default(monkeypatch):
    """Unparseable budget env values do not crash and yield the default."""
    for env_name in (
        "SURROUNDING_LEADING_MAX_TOKENS",
        "SURROUNDING_TRAILING_MAX_TOKENS",
    ):
        monkeypatch.setenv(env_name, "not-a-number")
    from lightrag.multimodal_context import (
        DEFAULT_SURROUNDING_MAX_TOKENS,
        _resolve_surrounding_budget,
    )

    resolved = _resolve_surrounding_budget(None, None)
    leading, trailing = resolved
    assert leading == DEFAULT_SURROUNDING_MAX_TOKENS
    assert trailing == DEFAULT_SURROUNDING_MAX_TOKENS
|