# Source file: test_paragraph_semantic_table_split.py (21 KB)

  1. """Regression tests for paragraph-semantic Stage B oversized-table handling."""
  2. import json
  3. import pytest
  4. from lightrag.chunker.paragraph_semantic import (
  5. _detect_table_format,
  6. _expand_block_with_table_splits,
  7. _split_html_rows,
  8. _split_long_block,
  9. _split_rows_by_tokens,
  10. _split_table_text,
  11. chunking_by_paragraph_semantic,
  12. )
  13. from lightrag.utils import Tokenizer, TokenizerInterface
  14. class _CharTokenizer(TokenizerInterface):
  15. """1:1 character-to-token mapping — keeps math obvious in assertions."""
  16. def encode(self, content: str):
  17. return [ord(ch) for ch in content]
  18. def decode(self, tokens):
  19. return "".join(chr(t) for t in tokens)
  20. def _make_tokenizer() -> Tokenizer:
  21. return Tokenizer(model_name="char", tokenizer=_CharTokenizer())
  22. @pytest.mark.offline
  23. def test_split_rows_by_tokens_few_rows_huge_total_no_empty_slice():
  24. # Reproduces the bug where target_chunks > len(rows) made target_rows
  25. # < 1, so int((i+1)*target_rows) collapsed to start and the loop
  26. # appended empty slices (which would later serialise as <table>[]…).
  27. tokenizer = _make_tokenizer()
  28. # 3 rows that each individually exceed target_max — forces
  29. # math.ceil(total/target_ideal) and math.ceil(total/target_max) to
  30. # both be much greater than len(rows).
  31. rows = [
  32. [{"col": "x" * 800}],
  33. [{"col": "y" * 800}],
  34. [{"col": "z" * 800}],
  35. ]
  36. chunks = _split_rows_by_tokens(
  37. rows,
  38. tokenizer,
  39. target_max=200,
  40. target_ideal=150,
  41. last_min=64,
  42. )
  43. assert chunks, "expected at least one chunk"
  44. for chunk in chunks:
  45. assert chunk, "Stage B must never emit an empty row slice"
  46. # Concatenation preserves all rows in order.
  47. flat: list = []
  48. for chunk in chunks:
  49. flat.extend(chunk)
  50. assert flat == rows
  51. @pytest.mark.offline
  52. def test_split_rows_by_tokens_balanced_split_yields_one_row_per_chunk():
  53. # When target_chunks gets capped at len(rows), each chunk holds one
  54. # row — verifies the cap kicks in and forward progress is preserved.
  55. tokenizer = _make_tokenizer()
  56. rows = [[{"col": "a" * 300}] for _ in range(4)]
  57. chunks = _split_rows_by_tokens(
  58. rows,
  59. tokenizer,
  60. target_max=200,
  61. target_ideal=150,
  62. last_min=10, # low enough that the tail-merge step doesn't fire
  63. )
  64. assert all(chunk for chunk in chunks)
  65. # Each row appears exactly once across the chunks.
  66. flat: list = []
  67. for chunk in chunks:
  68. flat.extend(chunk)
  69. assert flat == rows
  70. def _build_oversized_table_text(num_rows: int, row_payload_size: int) -> str:
  71. rows = [[f"r{idx}-" + "x" * row_payload_size] for idx in range(num_rows)]
  72. return f'<table id="tb-1" format="json">{json.dumps(rows)}</table>'
  73. def _write_blocks_jsonl(tmp_path, content: str) -> str:
  74. path = tmp_path / "doc.blocks.jsonl"
  75. row = {
  76. "type": "content",
  77. "heading": "Section",
  78. "parent_headings": [],
  79. "level": 2,
  80. "content": content,
  81. }
  82. path.write_text(json.dumps(row, ensure_ascii=False), encoding="utf-8")
  83. return str(path)
  84. @pytest.mark.offline
  85. def test_expand_block_assigns_first_and_last_roles_to_glued_blocks():
  86. # An oversized table sandwiched between leading and trailing paragraphs
  87. # produces three slices: "first" (glued with leading paras),
  88. # "middle" (standalone), "last" (glued with trailing paras). Before
  89. # the fix, the first/last blocks defaulted to "none" and lost their
  90. # directional merge-protection.
  91. tokenizer = _make_tokenizer()
  92. table_text = _build_oversized_table_text(num_rows=6, row_payload_size=200)
  93. block = {
  94. "heading": "Section",
  95. "parent_headings": ["Doc"],
  96. "level": 2,
  97. "paragraphs": [
  98. {"text": "lead paragraph", "is_table": False},
  99. {"text": table_text, "is_table": True},
  100. {"text": "trailing paragraph", "is_table": False},
  101. ],
  102. }
  103. out = _expand_block_with_table_splits(
  104. block,
  105. tokenizer=tokenizer,
  106. table_max=400,
  107. table_ideal=300,
  108. table_min_last=128,
  109. )
  110. roles = [b["table_chunk_role"] for b in out]
  111. assert roles[0] == "first", f"expected leading block role=first, got {roles}"
  112. assert roles[-1] == "last", f"expected trailing block role=last, got {roles}"
  113. assert all(
  114. r == "middle" for r in roles[1:-1]
  115. ), f"expected middle slices between first/last, got {roles}"
  116. # Boundary glue still works: leading text sits inside the first block,
  117. # trailing text sits inside the last block.
  118. assert any(
  119. p["text"] == "lead paragraph" for p in out[0]["paragraphs"]
  120. ), "leading paragraph must glue with the first table slice"
  121. assert any(
  122. p["text"] == "trailing paragraph" for p in out[-1]["paragraphs"]
  123. ), "trailing paragraph must glue with the last table slice"
  124. assert all(
  125. "表格片段" not in b["heading"] for b in out
  126. ), "Stage B should not expose legacy table-fragment heading suffixes"
  127. @pytest.mark.offline
  128. def test_expand_block_two_oversized_tables_separates_last_and_first_roles():
  129. # Two oversized tables in the same heading block: the tail of the first
  130. # split must carry role="last" and not be silently merged into the
  131. # head of the second split (which must carry role="first").
  132. tokenizer = _make_tokenizer()
  133. block = {
  134. "heading": "Section",
  135. "parent_headings": [],
  136. "level": 2,
  137. "paragraphs": [
  138. {
  139. "text": _build_oversized_table_text(num_rows=4, row_payload_size=200),
  140. "is_table": True,
  141. },
  142. {"text": "between tables", "is_table": False},
  143. {
  144. "text": _build_oversized_table_text(num_rows=4, row_payload_size=200),
  145. "is_table": True,
  146. },
  147. ],
  148. }
  149. out = _expand_block_with_table_splits(
  150. block,
  151. tokenizer=tokenizer,
  152. table_max=400,
  153. table_ideal=300,
  154. table_min_last=128,
  155. )
  156. roles = [b["table_chunk_role"] for b in out]
  157. # We expect the role sequence to start with "first", end with "last",
  158. # and contain at least one "last" -> "first" transition (the boundary
  159. # between the two oversized tables) without any boundary block losing
  160. # its role.
  161. assert roles[0] == "first"
  162. assert roles[-1] == "last"
  163. assert "last" in roles
  164. # The transition: there must be a "last" immediately followed by a
  165. # "first" somewhere in the middle of the role sequence.
  166. transitions = list(zip(roles, roles[1:]))
  167. assert (
  168. ("last", "first") in transitions
  169. ), f"expected a last->first boundary between the two split tables, got {roles}"
  170. @pytest.mark.offline
  171. def test_expand_block_duplicates_short_text_between_oversized_tables():
  172. tokenizer = _make_tokenizer()
  173. bridge = "between tables"
  174. block = {
  175. "heading": "Section",
  176. "parent_headings": [],
  177. "level": 2,
  178. "paragraphs": [
  179. {
  180. "text": _build_oversized_table_text(num_rows=4, row_payload_size=200),
  181. "is_table": True,
  182. },
  183. {"text": bridge, "is_table": False},
  184. {
  185. "text": _build_oversized_table_text(num_rows=4, row_payload_size=200),
  186. "is_table": True,
  187. },
  188. ],
  189. }
  190. out = _expand_block_with_table_splits(
  191. block,
  192. tokenizer=tokenizer,
  193. table_max=400,
  194. table_ideal=300,
  195. table_min_last=128,
  196. target_max=800,
  197. chunk_overlap_token_size=100,
  198. )
  199. roles = [b["table_chunk_role"] for b in out]
  200. boundary_idx = next(
  201. i
  202. for i, (left, right) in enumerate(zip(roles, roles[1:]))
  203. if (left, right) == ("last", "first")
  204. )
  205. assert bridge in out[boundary_idx]["content"]
  206. assert bridge in out[boundary_idx + 1]["content"]
  207. @pytest.mark.offline
  208. def test_expand_block_emits_middle_text_when_table_bridge_is_long():
  209. tokenizer = _make_tokenizer()
  210. bridge = ("A" * 45) + ("B" * 50) + ("C" * 45)
  211. block = {
  212. "heading": "Section",
  213. "parent_headings": [],
  214. "level": 2,
  215. "paragraphs": [
  216. {
  217. "text": _build_oversized_table_text(num_rows=6, row_payload_size=120),
  218. "is_table": True,
  219. },
  220. {"text": bridge, "is_table": False},
  221. {
  222. "text": _build_oversized_table_text(num_rows=6, row_payload_size=120),
  223. "is_table": True,
  224. },
  225. ],
  226. }
  227. out = _expand_block_with_table_splits(
  228. block,
  229. tokenizer=tokenizer,
  230. table_max=260,
  231. table_ideal=180,
  232. table_min_last=32,
  233. target_max=400,
  234. chunk_overlap_token_size=45,
  235. )
  236. middle_idx = next(
  237. i
  238. for i, blk in enumerate(out)
  239. if blk["table_chunk_role"] == "none" and blk["content"] == "B" * 50
  240. )
  241. assert out[middle_idx - 1]["table_chunk_role"] == "last"
  242. assert "A" * 45 in out[middle_idx - 1]["content"]
  243. assert "B" * 50 not in out[middle_idx - 1]["content"]
  244. assert out[middle_idx + 1]["table_chunk_role"] == "first"
  245. assert out[middle_idx + 1]["content"].startswith("C" * 45)
  246. assert "B" * 50 not in out[middle_idx + 1]["content"]
  247. assert all(b["tokens"] <= 400 for b in out), [b["tokens"] for b in out]
  248. @pytest.mark.offline
  249. def test_public_chunking_adds_part_suffixes_to_all_table_split_fragments(tmp_path):
  250. tokenizer = _make_tokenizer()
  251. body = "\n".join(
  252. [
  253. "lead paragraph",
  254. _build_oversized_table_text(num_rows=6, row_payload_size=200),
  255. "trailing paragraph",
  256. ]
  257. )
  258. blocks_path = _write_blocks_jsonl(tmp_path, body)
  259. chunks = chunking_by_paragraph_semantic(
  260. tokenizer,
  261. body,
  262. chunk_token_size=800,
  263. blocks_path=blocks_path,
  264. chunk_overlap_token_size=0,
  265. )
  266. assert len(chunks) > 1
  267. assert [chunk["heading"]["heading"] for chunk in chunks] == [
  268. f"Section [part {idx}]" for idx in range(1, len(chunks) + 1)
  269. ]
  270. assert all("表格片段" not in chunk["heading"]["heading"] for chunk in chunks)
# ---------------------------------------------------------------------------
# Table-aware fallback tests: split on row boundaries first, and fall back to
# character splitting only as a last resort.
# ---------------------------------------------------------------------------
  274. @pytest.mark.offline
  275. def test_detect_table_format_explicit_attr():
  276. assert _detect_table_format('id="t1" format="json"', "[]") == "json"
  277. assert _detect_table_format("format='html'", "<tr></tr>") == "html"
  278. # Unknown formats fall through (force the caller to use char fallback).
  279. assert _detect_table_format('format="markdown"', "...") is None
  280. @pytest.mark.offline
  281. def test_detect_table_format_sniff_when_attrs_silent():
  282. assert _detect_table_format("", '[{"a":1}]') == "json"
  283. assert _detect_table_format("", "<tr><td>x</td></tr>") == "html"
  284. # Body that doesn't look like JSON or HTML → unknown.
  285. assert _detect_table_format("", "plain text rows") is None
  286. @pytest.mark.offline
  287. def test_split_html_rows_extracts_tr_elements():
  288. body = (
  289. "<thead><tr><th>h</th></tr></thead>"
  290. "<tbody><tr><td>a</td></tr><tr><td>b</td></tr></tbody>"
  291. )
  292. rows = _split_html_rows(body)
  293. assert rows is not None
  294. assert len(rows) == 3
  295. # Each row carries its parent wrapper so the chunk serialiser can
  296. # rebuild <thead>/<tbody> instead of dropping them silently.
  297. assert [w for w, _ in rows] == ["thead", "tbody", "tbody"]
  298. assert all(tr.startswith("<tr") and tr.endswith("</tr>") for _, tr in rows)
  299. @pytest.mark.offline
  300. def test_split_html_rows_no_tr_returns_none():
  301. assert _split_html_rows("just text, no rows") is None
  302. assert _split_html_rows("") is None
  303. @pytest.mark.offline
  304. def test_split_table_text_single_row_oversized_falls_to_character_split():
  305. # A 1-row table whose single cell is huge cannot be reduced via row
  306. # boundary — the function must fall to character splitting and respect
  307. # target_max on every output piece.
  308. tokenizer = _make_tokenizer()
  309. rows = [[{"col": "x" * 2000}]]
  310. table_text = f'<table id="tb-1" format="json">{json.dumps(rows)}</table>'
  311. pieces = _split_table_text(
  312. table_text,
  313. tokenizer=tokenizer,
  314. target_max=500,
  315. target_ideal=350,
  316. last_min=128,
  317. )
  318. assert len(pieces) >= 2, "single-row oversized table must produce multiple pieces"
  319. # Every piece honors the cap (this is the contract violation the user
  320. # reported when the previous code emitted a single 2000-token table).
  321. assert all(_count_tokens(tokenizer, p) <= 500 for p in pieces)
  322. @pytest.mark.offline
  323. def test_split_table_text_multirow_one_huge_row_mixed_output():
  324. # A multi-row table where most rows fit but one row is itself huge.
  325. # The fit-able rows must keep <table>...</table> wrapping; the huge
  326. # row's chunk falls to character splitting.
  327. tokenizer = _make_tokenizer()
  328. small_row = [{"col": "ok"}]
  329. huge_row = [{"col": "z" * 2000}]
  330. rows = [small_row, huge_row, small_row]
  331. table_text = f'<table id="tb-1" format="json">{json.dumps(rows)}</table>'
  332. pieces = _split_table_text(
  333. table_text,
  334. tokenizer=tokenizer,
  335. target_max=500,
  336. target_ideal=350,
  337. last_min=64,
  338. )
  339. assert all(_count_tokens(tokenizer, p) <= 500 for p in pieces)
  340. # At least one fragment for the small rows must survive as legal markup.
  341. table_pieces = [p for p in pieces if p.startswith("<table ")]
  342. assert table_pieces, "expected at least one <table>-wrapped piece for fit-able rows"
  343. # The huge row must produce non-table text fragments (character split).
  344. text_pieces = [p for p in pieces if not p.startswith("<table ")]
  345. assert text_pieces, "huge row must yield character-split text fragments"
  346. @pytest.mark.offline
  347. def test_split_table_text_html_table_split_by_tr():
  348. # HTML-format table: rows are <tr>...</tr>; each output fragment must
  349. # remain a legal <table {attrs}>{rows}</table> string.
  350. tokenizer = _make_tokenizer()
  351. body = "".join(f"<tr><td>{'r' * 200}</td></tr>" for _ in range(5))
  352. table_text = f'<table id="tb-h1" format="html">{body}</table>'
  353. pieces = _split_table_text(
  354. table_text,
  355. tokenizer=tokenizer,
  356. target_max=500,
  357. target_ideal=350,
  358. last_min=64,
  359. )
  360. assert len(pieces) >= 2
  361. # All pieces should be legal <table>...</table> fragments (none of the
  362. # rows individually exceeds target_max, so no character fallback).
  363. assert all(p.startswith("<table ") and p.endswith("</table>") for p in pieces)
  364. assert all(_count_tokens(tokenizer, p) <= 500 for p in pieces)
  365. @pytest.mark.offline
  366. def test_split_table_text_html_preserves_thead_tbody_wrappers():
  367. # When an HTML table mixes <thead> and <tbody>, the row splitter
  368. # used to drop the wrappers entirely — the chunked output came back
  369. # as bare <tr> sequences. The fix re-emits each wrapper around its
  370. # rows in every chunk so the table structure survives splitting.
  371. tokenizer = _make_tokenizer()
  372. head_row = "<tr><th>" + ("h" * 80) + "</th></tr>"
  373. body_rows = "".join(f"<tr><td>{'b' * 80}{i}</td></tr>" for i in range(4))
  374. body = f"<thead>{head_row}</thead><tbody>{body_rows}</tbody>"
  375. table_text = f'<table id="tb-mixed" format="html">{body}</table>'
  376. pieces = _split_table_text(
  377. table_text,
  378. tokenizer=tokenizer,
  379. target_max=400,
  380. target_ideal=280,
  381. last_min=64,
  382. )
  383. # Multiple chunks expected and every chunk must remain a legal
  384. # <table>-wrapped fragment.
  385. assert len(pieces) >= 2
  386. assert all(p.startswith("<table ") and p.endswith("</table>") for p in pieces)
  387. # Every chunk that contains the header row must still wrap it in
  388. # <thead>...</thead>; every chunk with body rows must wrap them in
  389. # <tbody>...</tbody>. Before the fix, both wrappers vanished.
  390. for piece in pieces:
  391. if "<th>" in piece:
  392. assert "<thead>" in piece and "</thead>" in piece, piece
  393. if "<td>" in piece:
  394. assert "<tbody>" in piece and "</tbody>" in piece, piece
  395. # Round-trip: concatenating just the row payloads from every chunk
  396. # recovers the original row sequence in order.
  397. extracted_rows: list[str] = []
  398. import re
  399. for piece in pieces:
  400. extracted_rows.extend(
  401. re.findall(r"<tr\b[^>]*>.*?</tr>", piece, re.DOTALL | re.IGNORECASE)
  402. )
  403. expected_rows = re.findall(r"<tr\b[^>]*>.*?</tr>", body, re.DOTALL | re.IGNORECASE)
  404. assert extracted_rows == expected_rows
  405. @pytest.mark.offline
  406. def test_split_table_text_unknown_format_falls_to_character():
  407. # No format attr, body that doesn't look like JSON/HTML → unknown.
  408. tokenizer = _make_tokenizer()
  409. table_text = '<table id="weird">' + ("plain row text " * 300) + "</table>"
  410. pieces = _split_table_text(
  411. table_text,
  412. tokenizer=tokenizer,
  413. target_max=500,
  414. target_ideal=350,
  415. last_min=64,
  416. )
  417. assert len(pieces) >= 2
  418. assert all(_count_tokens(tokenizer, p) <= 500 for p in pieces)
  419. @pytest.mark.offline
  420. def test_expand_block_single_row_table_no_longer_left_intact():
  421. # Stage B integration: previously a single-row oversized table was
  422. # appended back to cur_paras unchanged, leading the block to reach
  423. # Stage C with the table whole and the character fallback shredding
  424. # the <table> tag. After the fix, Stage B itself produces multiple
  425. # pieces for such a table.
  426. tokenizer = _make_tokenizer()
  427. rows = [[{"col": "x" * 2000}]] # single huge row
  428. table_text = f'<table id="tb-1" format="json">{json.dumps(rows)}</table>'
  429. block = {
  430. "heading": "Section",
  431. "parent_headings": [],
  432. "level": 2,
  433. "paragraphs": [
  434. {"text": "lead", "is_table": False},
  435. {"text": table_text, "is_table": True},
  436. {"text": "trail", "is_table": False},
  437. ],
  438. }
  439. out = _expand_block_with_table_splits(
  440. block,
  441. tokenizer=tokenizer,
  442. table_max=400,
  443. table_ideal=300,
  444. table_min_last=128,
  445. )
  446. # Multiple sub-blocks must be produced; the oversized table no longer
  447. # passes through whole.
  448. assert len(out) >= 2
  449. # First/last role protection still fires when the table was reduced.
  450. roles = [b["table_chunk_role"] for b in out]
  451. assert (
  452. "first" in roles or "last" in roles
  453. ), f"expected first/last role assignment after table split, got {roles}"
  454. @pytest.mark.offline
  455. def test_split_long_block_table_dominant_no_anchor_keeps_some_table_markup():
  456. # Stage C integration: a block dominated by an oversized table with no
  457. # anchor candidates used to be character-split end-to-end, destroying
  458. # the <table> tag. After the fix, at least some output sub-blocks
  459. # retain legal <table>...</table> markup for the rows that fit.
  460. tokenizer = _make_tokenizer()
  461. # Many small rows -> row-boundary split produces multiple legal
  462. # <table> fragments, none of which individually exceed target_max.
  463. rows = [[{"col": f"r{i}-" + "v" * 200}] for i in range(8)]
  464. table_text = f'<table id="tb-1" format="json">{json.dumps(rows)}</table>'
  465. paragraphs = [
  466. {"text": "Sufficiently long lead paragraph " * 30, "is_table": False},
  467. {"text": table_text, "is_table": True},
  468. ]
  469. sub_blocks = _split_long_block(
  470. paragraphs,
  471. heading="Heading",
  472. parent_headings=[],
  473. level=2,
  474. table_chunk_role="none",
  475. tokenizer=tokenizer,
  476. target_max=600,
  477. target_ideal=450,
  478. )
  479. # Every sub-block respects the cap.
  480. assert all(b["tokens"] <= 600 for b in sub_blocks)
  481. # At least one sub-block keeps an unbroken <table> fragment somewhere
  482. # in its content (proof that row-boundary preservation kicked in).
  483. contents = [b["content"] for b in sub_blocks]
  484. assert any(
  485. ("<table " in c and "</table>" in c) for c in contents
  486. ), "expected at least one sub-block to retain a legal <table> fragment"
  487. @pytest.mark.offline
  488. def test_split_table_text_budgets_wrapper_overhead_for_target_max():
  489. # ``_split_rows_by_tokens`` measures only the body (json.dumps(rows));
  490. # the surrounding ``<table {attrs}></table>`` wrapper costs tokens too.
  491. # Without wrapper-aware budgeting, a chunk whose body just fits
  492. # target_max would overflow once wrapped and trigger character
  493. # fallback — shredding the row structure for no good reason.
  494. tokenizer = _make_tokenizer()
  495. # A long attrs string forces a non-trivial wrapper overhead so the
  496. # body-only budget previously chosen (==target_max) overflows when
  497. # the wrapper is added back in.
  498. attrs_padding = "x" * 80
  499. rows = [[{"col": "y" * 80}] for _ in range(4)]
  500. table_text = f'<table id="{attrs_padding}" format="json">{json.dumps(rows)}</table>'
  501. pieces = _split_table_text(
  502. table_text,
  503. tokenizer=tokenizer,
  504. target_max=250,
  505. target_ideal=180,
  506. last_min=64,
  507. )
  508. # Every output piece honors the cap.
  509. assert all(_count_tokens(tokenizer, p) <= 250 for p in pieces), [
  510. _count_tokens(tokenizer, p) for p in pieces
  511. ]
  512. # Row structure preserved — none of the pieces fell back to
  513. # character fragments because of accidental wrapper overflow.
  514. assert all(p.startswith("<table ") and p.endswith("</table>") for p in pieces)
  515. def _count_tokens(tokenizer: Tokenizer, text: str) -> int:
  516. return len(tokenizer.encode(text))