parse_document.py 79 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914
  1. #!/usr/bin/env python3
  2. """
  3. ABOUTME: Parses DOCX documents into text blocks using python-docx
  4. ABOUTME: Extracts automatic numbering, splits by headings, converts tables to JSON
  5. """
  6. import json
  7. import sys
  8. try:
  9. from docx import Document
  10. except ImportError:
  11. print(
  12. "Error: python-docx not installed. Run: pip install python-docx",
  13. file=sys.stderr,
  14. )
  15. sys.exit(1)
  16. from .numbering_resolver import NumberingResolver
  17. from .table_extractor import TableExtractor
  18. from .utils import estimate_tokens
  19. from .drawing_image_extractor import (
  20. DrawingExtractionContext,
  21. extract_drawing_placeholder_from_element,
  22. extract_vml_image_placeholder_from_element,
  23. )
  24. # Constants for content validation (character-based for UI/display)
  25. MAX_HEADING_LENGTH = 200 # Maximum heading length in characters (UI constraint)
  26. MAX_ANCHOR_CANDIDATE_LENGTH = (
  27. 100 # Maximum length for candidate anchor paragraphs (characters)
  28. )
  29. # Constants for content splitting (token-based for LLM context management)
  30. IDEAL_BLOCK_CONTENT_TOKENS = 6000 # Ideal target size for balanced splitting (tokens)
  31. MAX_BLOCK_CONTENT_TOKENS = 8000 # Maximum block content (tokens, hard limit)
  32. SMALL_TAIL_THRESHOLD = (
  33. MAX_BLOCK_CONTENT_TOKENS - IDEAL_BLOCK_CONTENT_TOKENS
  34. ) // 2 # Threshold for tail absorption (1000 tokens)
  35. # Constants for table splitting (token-based)
  36. TABLE_IDEAL_TOKENS = 3000 # Ideal target size for table chunks (tokens)
  37. TABLE_MAX_TOKENS = 5000 # Maximum table size before splitting (tokens), must smaller than IDEAL_BLOCK_CONTENT_TOKENS
  38. TABLE_MIN_LAST_CHUNK_TOKENS = int(
  39. (TABLE_MAX_TOKENS - TABLE_IDEAL_TOKENS) * 0.8
  40. ) # Minimum size for last chunk to avoid tiny fragments
  41. TABLE_CHUNK_SUFFIX_LABEL = "表格片段" # Label prefix for split table chunk headings
  42. # OOXML tracked-change/comment tags whose subtree must be dropped so we only
  43. # surface the *final* revised text. w:ins / w:moveTo are kept via default
  44. # recursion so inserted/moved-in content survives.
  45. _SKIP_REVISION_TAGS = frozenset({"del", "moveFrom"})
  46. _SKIP_COMMENT_TAGS = frozenset(
  47. {"commentRangeStart", "commentRangeEnd", "commentReference", "annotationRef"}
  48. )
  49. _SKIP_PARAGRAPH_TAGS = _SKIP_REVISION_TAGS | _SKIP_COMMENT_TAGS
  50. def print_error(title: str, details: str, solution: str):
  51. """
  52. Print a friendly, formatted error message.
  53. Args:
  54. title: Error title
  55. details: Detailed error information
  56. solution: Suggested solution steps
  57. """
  58. print("\n" + "=" * 80, file=sys.stderr)
  59. print(f"ERROR: {title}", file=sys.stderr)
  60. print("=" * 80, file=sys.stderr)
  61. print(f"\n{details}", file=sys.stderr)
  62. print("\nSOLUTION:", file=sys.stderr)
  63. print(solution, file=sys.stderr)
  64. print("\n" + "=" * 80 + "\n", file=sys.stderr)
  65. def truncate_heading(heading_text: str, para_id: str = None) -> str:
  66. """
  67. Truncate heading if it exceeds MAX_HEADING_LENGTH.
  68. Args:
  69. heading_text: The heading text to check
  70. para_id: Optional paragraph ID for warning message
  71. Returns:
  72. str: Original heading if within limit, truncated heading with "..." if too long
  73. """
  74. if len(heading_text) > MAX_HEADING_LENGTH:
  75. truncated = heading_text[: MAX_HEADING_LENGTH - 3] + "..."
  76. location = f" (para_id: {para_id})" if para_id else ""
  77. print(
  78. f"Warning: Heading truncated (length {len(heading_text)} > max {MAX_HEADING_LENGTH}){location}: "
  79. f'"{truncated}"',
  80. file=sys.stderr,
  81. )
  82. return truncated
  83. return heading_text
  84. def validate_heading_length(heading_text: str, para_id: str):
  85. """
  86. Validate that heading length does not exceed MAX_HEADING_LENGTH.
  87. Args:
  88. heading_text: The heading text to validate
  89. para_id: The paragraph ID for error reporting
  90. Exits:
  91. sys.exit(1) if heading exceeds maximum length
  92. """
  93. if len(heading_text) > MAX_HEADING_LENGTH:
  94. preview = (
  95. heading_text[:100] + "..." if len(heading_text) > 100 else heading_text
  96. )
  97. print_error(
  98. f"Heading too long ({len(heading_text)} characters, max {MAX_HEADING_LENGTH})",
  99. f'The following heading exceeds the maximum allowed length:\n\n "{preview}"\n\n'
  100. f"Location: Paragraph ID {para_id}\n"
  101. f"Actual length: {len(heading_text)} characters",
  102. " 1. Open the document in Microsoft Word\n"
  103. f" 2. Shorten this heading to {MAX_HEADING_LENGTH} characters or less\n"
  104. " 3. Re-upload it to LightRAG",
  105. )
  106. sys.exit(1)
  107. def validate_table_tokens(table_json: str, block_heading: str):
  108. """
  109. Validate that table JSON does not exceed MAX_BLOCK_CONTENT_TOKENS.
  110. Args:
  111. table_json: The JSON representation of the table
  112. block_heading: The heading of the block containing this table
  113. Exits:
  114. sys.exit(1) if table exceeds maximum token limit
  115. """
  116. table_tokens = estimate_tokens(table_json)
  117. if table_tokens > MAX_BLOCK_CONTENT_TOKENS:
  118. print_error(
  119. f"Table too large (~{table_tokens} tokens, max {MAX_BLOCK_CONTENT_TOKENS})",
  120. f"A table in the document is too large for LLM processing.\n\n"
  121. f'Location: Under heading "{block_heading}"\n'
  122. f"Table size: ~{table_tokens} tokens ({len(table_json)} characters)\n\n"
  123. "Large tables can cause issues with file chunking.",
  124. " 1. Open the document in Microsoft Word\n"
  125. f' 2. Locate the table under heading "{block_heading}"\n'
  126. " 3. Split the table into smaller tables, or\n"
  127. " 4. Simplify the table content\n"
  128. " 5. Re-upload it to LightRAG",
  129. )
  130. sys.exit(1)
  131. def find_first_valid_para_id(para_ids: list) -> str | None:
  132. """
  133. Find the first valid paraId in a 2D array of paraIds.
  134. Args:
  135. para_ids: 2D list of paraIds from table cells
  136. Returns:
  137. First non-None paraId found, or None when every cell lacks a paraId.
  138. Callers must tolerate ``None`` and treat it as a tracking gap rather
  139. than a fatal error (legacy / non-Word docx authors omit ``w14:paraId``
  140. attributes and we want to keep parsing).
  141. """
  142. for row in para_ids:
  143. for para_id in row:
  144. if para_id:
  145. return para_id
  146. return None
  147. def find_last_valid_para_id(para_ids: list) -> str | None:
  148. """
  149. Find the last valid paraId in a 2D array of paraIds.
  150. Returns the last non-None paraId, falling back to the first valid one
  151. when reverse-iteration does not yield anything (single-paraId tables),
  152. and finally ``None`` when every cell lacks a paraId.
  153. """
  154. for row in reversed(para_ids):
  155. for para_id in reversed(row):
  156. if para_id:
  157. return para_id
  158. return find_first_valid_para_id(para_ids)
  159. def _table_has_any_paraid(para_ids: list) -> bool:
  160. """True when at least one cell in the 2D paraId grid carries an id."""
  161. return find_first_valid_para_id(para_ids) is not None
  162. def split_table(
  163. table_rows: list,
  164. para_ids: list,
  165. para_ids_end: list,
  166. header_indices: list,
  167. debug: bool = False,
  168. ) -> list:
  169. """
  170. Split large table into chunks at row boundaries.
  171. Splitting Strategy:
  172. 1. Only split if table JSON exceeds TABLE_MAX_TOKENS (5000 tokens)
  173. 2. Calculate target chunks based on TABLE_IDEAL_TOKENS (3000 tokens)
  174. 3. Split at row boundaries to achieve balanced chunk sizes
  175. 4. Avoid very small last chunk: if last chunk < 1000 tokens, merge with previous
  176. 5. Extract first valid paraId for each chunk as UUID
  177. Output Strategy:
  178. - First chunk: Merges with preceding content, uses original heading
  179. - Middle chunks: Standalone blocks with heading suffix [1], [2], etc.
  180. - Last chunk: Merges with following content, carries the cross-page
  181. ``_table_header`` so the host block can surface it via ``table_headers``
  182. - The cross-page repeating header rows (extracted from ``w:tblHeader``)
  183. flow per-table into each containing block's ``table_headers`` list
  184. Args:
  185. table_rows: 2D array of table content
  186. para_ids: 2D array of paraIds - first paraId in each cell (for uuid)
  187. para_ids_end: 2D array of paraIds - last paraId in each cell (for uuid_end)
  188. header_indices: List of row indices that are table headers
  189. debug: If True, output debug information
  190. Returns:
  191. List of chunk dicts: [{
  192. 'rows': 2D array subset,
  193. 'para_ids': 2D array subset,
  194. 'para_ids_end': 2D array subset,
  195. 'uuid': first valid paraId in chunk,
  196. 'is_first': True if first chunk,
  197. 'is_last': True if last chunk
  198. }, ...]
  199. """
  200. import math
  201. # Calculate total JSON token count
  202. total_json = json.dumps(table_rows, ensure_ascii=False)
  203. total_tokens = estimate_tokens(total_json)
  204. if total_tokens <= TABLE_MAX_TOKENS:
  205. # No splitting needed
  206. uuid = find_first_valid_para_id(para_ids)
  207. return [
  208. {
  209. "rows": table_rows,
  210. "para_ids": para_ids,
  211. "para_ids_end": para_ids_end,
  212. "uuid": uuid,
  213. "is_first": True,
  214. "is_last": True,
  215. }
  216. ]
  217. # Need to split - calculate target number of chunks
  218. target_chunks = math.ceil(total_tokens / TABLE_IDEAL_TOKENS)
  219. min_chunks_needed = math.ceil(total_tokens / TABLE_MAX_TOKENS)
  220. target_chunks = max(target_chunks, min_chunks_needed)
  221. # Split at row boundaries
  222. chunks = []
  223. num_rows = len(table_rows)
  224. target_rows_per_chunk = num_rows / target_chunks
  225. start_row = 0
  226. for i in range(target_chunks):
  227. # Calculate end row for this chunk
  228. if i == target_chunks - 1:
  229. # Last chunk gets all remaining rows
  230. end_row = num_rows
  231. else:
  232. # Target end row (rounded)
  233. end_row = min(int((i + 1) * target_rows_per_chunk), num_rows)
  234. # Adjust to avoid very small last chunk
  235. rows_remaining = num_rows - end_row
  236. if rows_remaining > 0 and rows_remaining < target_rows_per_chunk * 0.3:
  237. # Last chunk would be too small, expand this chunk
  238. end_row = num_rows
  239. # Extract chunk
  240. chunk_rows = table_rows[start_row:end_row]
  241. chunk_para_ids = para_ids[start_row:end_row]
  242. chunk_para_ids_end = para_ids_end[start_row:end_row]
  243. if chunk_rows:
  244. chunk_uuid = find_first_valid_para_id(chunk_para_ids)
  245. chunks.append(
  246. {
  247. "rows": chunk_rows,
  248. "para_ids": chunk_para_ids,
  249. "para_ids_end": chunk_para_ids_end,
  250. "uuid": chunk_uuid,
  251. "is_first": (i == 0),
  252. "is_last": (end_row >= num_rows),
  253. }
  254. )
  255. start_row = end_row
  256. if start_row >= num_rows:
  257. break
  258. # Post-processing: Merge very small last chunk with previous chunk if possible
  259. if len(chunks) >= 2:
  260. last_chunk = chunks[-1]
  261. last_chunk_json = json.dumps(last_chunk["rows"], ensure_ascii=False)
  262. last_chunk_tokens = estimate_tokens(last_chunk_json)
  263. if last_chunk_tokens < TABLE_MIN_LAST_CHUNK_TOKENS:
  264. # Try to merge with previous chunk
  265. prev_chunk = chunks[-2]
  266. # Calculate combined size
  267. combined_rows = prev_chunk["rows"] + last_chunk["rows"]
  268. combined_json = json.dumps(combined_rows, ensure_ascii=False)
  269. combined_tokens = estimate_tokens(combined_json)
  270. # Only merge if combined size doesn't exceed max limit
  271. if combined_tokens <= TABLE_MAX_TOKENS:
  272. # Merge the chunks
  273. merged_para_ids = prev_chunk["para_ids"] + last_chunk["para_ids"]
  274. merged_para_ids_end = (
  275. prev_chunk["para_ids_end"] + last_chunk["para_ids_end"]
  276. )
  277. chunks[-2] = {
  278. "rows": combined_rows,
  279. "para_ids": merged_para_ids,
  280. "para_ids_end": merged_para_ids_end,
  281. "uuid": prev_chunk["uuid"], # Keep UUID of first chunk
  282. "is_first": prev_chunk["is_first"],
  283. "is_last": True, # This becomes the last chunk
  284. }
  285. chunks.pop() # Remove the last chunk
  286. if debug:
  287. print(
  288. f"[DEBUG] Merged small last chunk (~{last_chunk_tokens} tokens) with previous chunk",
  289. file=sys.stderr,
  290. )
  291. print(
  292. f" Combined size: ~{combined_tokens} tokens", file=sys.stderr
  293. )
  294. return chunks
  295. def split_table_with_heading(
  296. table_rows: list,
  297. para_ids: list,
  298. para_ids_end: list,
  299. header_indices: list,
  300. current_heading: str,
  301. start_suffix: int = 0,
  302. debug: bool = False,
  303. ) -> list:
  304. """
  305. Wrapper for split_table that includes heading information in debug output.
  306. Supports sequential numbering when multiple tables are split in the same block.
  307. Args:
  308. table_rows: 2D array of table content
  309. para_ids: 2D array of paraIds - first paraId in each cell (for uuid)
  310. para_ids_end: 2D array of paraIds - last paraId in each cell (for uuid_end)
  311. header_indices: List of row indices that are table headers
  312. current_heading: Current block heading (for generating chunk headings)
  313. start_suffix: Starting suffix number for non-first chunks (default: 0)
  314. When multiple tables in the same block are split, this ensures
  315. sequential numbering (e.g., [1], [2] for first table, [3], [4] for second)
  316. debug: If True, output debug information with headings
  317. Returns:
  318. Same as split_table(), with each chunk having suffix calculated from start_suffix
  319. """
  320. chunks = split_table(
  321. table_rows, para_ids, para_ids_end, header_indices, debug=False
  322. )
  323. # Add suffix_number to each chunk for later use
  324. for i, chunk in enumerate(chunks):
  325. if i == 0:
  326. chunk["suffix_number"] = None # First chunk has no suffix
  327. else:
  328. chunk["suffix_number"] = start_suffix + i
  329. # Debug output with headings
  330. if debug and len(chunks) > 1:
  331. print(
  332. f"\n[DEBUG] Table split into {len(chunks)} chunks (final)", file=sys.stderr
  333. )
  334. for i, chunk in enumerate(chunks):
  335. chunk_json = json.dumps(chunk["rows"], ensure_ascii=False)
  336. # Generate heading for this chunk
  337. if chunk["suffix_number"] is None:
  338. chunk_heading = current_heading
  339. else:
  340. chunk_heading = f"{current_heading} [{TABLE_CHUNK_SUFFIX_LABEL}{chunk['suffix_number']}]"
  341. print(
  342. f" Chunk {i+1}: heading=\"{chunk_heading}\", {len(chunk['rows'])} rows, {len(chunk_json)} chars",
  343. file=sys.stderr,
  344. )
  345. return chunks
  346. def merge_small_blocks(blocks: list, debug: bool = False) -> tuple:
  347. """
  348. Merge blocks below IDEAL_BLOCK_CONTENT_TOKENS following bottom-up, level-aware strategy.
  349. Strategy (bottom-up approach):
  350. 1. Process levels from deepest (largest number) to shallowest (level 1)
  351. 2. For each level:
  352. - Phase A: Same-level merging - merge adjacent blocks of same level
  353. - Phase B: Cross-level absorption - allow higher levels to absorb current level
  354. 3. Table chunk role restrictions:
  355. - 'middle': cannot merge with any block
  356. - 'first': can only merge forward (with next block)
  357. - 'last': can only merge backward (with previous block)
  358. - 'none': no restrictions
  359. 4. Stop merging a block once it reaches IDEAL_BLOCK_CONTENT_TOKENS (locked)
  360. 5. Reject merge if combined size > MAX_BLOCK_CONTENT_TOKENS
  361. 6. Merged block's level = level of the block whose heading is kept
  362. Args:
  363. blocks: List of block dictionaries with 'level' and 'table_chunk_role' fields
  364. debug: If True, output debug information and return merge count
  365. Returns:
  366. Tuple of (merged_blocks, merge_count)
  367. """
  368. if len(blocks) <= 1:
  369. return blocks, 0
  370. merged_count = 0
  371. result = blocks.copy()
  372. # Find all unique levels and sort from deepest to shallowest
  373. levels = sorted(set(block.get("level", 1) for block in result), reverse=True)
  374. if debug:
  375. print(
  376. f"\n[DEBUG] merge_small_blocks: Processing {len(result)} blocks across levels {levels}",
  377. file=sys.stderr,
  378. )
  379. # Process each level from deepest to shallowest
  380. for current_level in levels:
  381. if debug:
  382. print(f"[DEBUG] Processing level {current_level}", file=sys.stderr)
  383. # Phase A: Same-level merging
  384. changed = True
  385. iteration = 0
  386. while changed:
  387. iteration += 1
  388. changed = False
  389. i = 0
  390. new_result = []
  391. while i < len(result):
  392. current_block = result[i]
  393. current_tokens = estimate_tokens(current_block["content"])
  394. block_level = current_block.get("level", 1)
  395. current_role = current_block.get("table_chunk_role", "none")
  396. # Only process blocks of current level that are below IDEAL and not locked
  397. is_below_ideal = (
  398. current_tokens < IDEAL_BLOCK_CONTENT_TOKENS and current_tokens > 0
  399. )
  400. is_current_level = block_level == current_level
  401. if is_below_ideal and is_current_level:
  402. merged = False
  403. # Check table chunk role restrictions
  404. can_merge_forward = current_role in ["none", "first"]
  405. can_merge_backward = current_role in ["none", "last"]
  406. # Try forward merge with next block (only same level in Phase A)
  407. if can_merge_forward and i + 1 < len(result):
  408. next_block = result[i + 1]
  409. next_level = next_block.get("level", 1)
  410. next_role = next_block.get("table_chunk_role", "none")
  411. next_can_merge_backward = next_role in ["none", "last"]
  412. # Phase A: Only merge same-level blocks
  413. if next_level == current_level and next_can_merge_backward:
  414. merged_content = (
  415. current_block["content"]
  416. + "\n\n"
  417. + next_block["content"]
  418. )
  419. combined_tokens = estimate_tokens(merged_content)
  420. if combined_tokens <= MAX_BLOCK_CONTENT_TOKENS:
  421. merged_block = {
  422. "uuid": current_block["uuid"],
  423. "uuid_end": next_block.get(
  424. "uuid_end", next_block["uuid"]
  425. ),
  426. "heading": current_block["heading"],
  427. "content": merged_content,
  428. "type": "text",
  429. "parent_headings": current_block["parent_headings"],
  430. "level": current_level,
  431. "table_chunk_role": "none",
  432. }
  433. combined_headers = current_block.get(
  434. "table_headers", []
  435. ) + next_block.get("table_headers", [])
  436. if combined_headers:
  437. merged_block["table_headers"] = combined_headers
  438. new_result.append(merged_block)
  439. merged = True
  440. merged_count += 1
  441. changed = True
  442. i += 2
  443. continue
  444. # Try backward merge with previous (only same level in Phase A)
  445. if not merged and can_merge_backward and len(new_result) > 0:
  446. prev_block = new_result[-1]
  447. prev_level = prev_block.get("level", 1)
  448. prev_role = prev_block.get("table_chunk_role", "none")
  449. prev_tokens = estimate_tokens(prev_block["content"])
  450. prev_can_merge_forward = prev_role in ["none", "first"]
  451. prev_below_ideal = prev_tokens < IDEAL_BLOCK_CONTENT_TOKENS
  452. # Phase A: Only merge same-level blocks, and prev must be below IDEAL
  453. if (
  454. prev_level == current_level
  455. and prev_can_merge_forward
  456. and prev_below_ideal
  457. ):
  458. merged_content = (
  459. prev_block["content"]
  460. + "\n\n"
  461. + current_block["content"]
  462. )
  463. combined_tokens = estimate_tokens(merged_content)
  464. if combined_tokens <= MAX_BLOCK_CONTENT_TOKENS:
  465. merged_block = {
  466. "uuid": prev_block["uuid"],
  467. "uuid_end": current_block.get(
  468. "uuid_end", current_block["uuid"]
  469. ),
  470. "heading": prev_block["heading"],
  471. "content": merged_content,
  472. "type": "text",
  473. "parent_headings": prev_block["parent_headings"],
  474. "level": current_level,
  475. "table_chunk_role": "none",
  476. }
  477. combined_headers = prev_block.get(
  478. "table_headers", []
  479. ) + current_block.get("table_headers", [])
  480. if combined_headers:
  481. merged_block["table_headers"] = combined_headers
  482. new_result[-1] = merged_block
  483. merged = True
  484. merged_count += 1
  485. changed = True
  486. i += 1
  487. continue
  488. # No merge happened, keep block
  489. if not merged:
  490. new_result.append(current_block)
  491. i += 1
  492. else:
  493. # Current block is at or above IDEAL, or not current level
  494. # Check for tail absorption: if remaining same-level blocks are small enough, absorb them all
  495. if (
  496. is_current_level
  497. and current_tokens >= IDEAL_BLOCK_CONTENT_TOKENS
  498. ):
  499. # Calculate total size of remaining same-level blocks
  500. remaining_same_level_tokens = 0
  501. remaining_end_idx = i + 1
  502. for j in range(i + 1, len(result)):
  503. next_block = result[j]
  504. next_level = next_block.get("level", 1)
  505. # Stop when we encounter a different level
  506. if next_level != current_level:
  507. break
  508. # Check if this block can be absorbed (table_chunk_role constraints)
  509. next_role = next_block.get("table_chunk_role", "none")
  510. if next_role == "middle":
  511. # Middle chunks cannot be absorbed - stop here
  512. break
  513. remaining_same_level_tokens += estimate_tokens(
  514. next_block["content"]
  515. )
  516. remaining_end_idx = j + 1
  517. # If remaining same-level blocks are small enough, absorb them all
  518. if (
  519. remaining_same_level_tokens > 0
  520. and remaining_same_level_tokens < SMALL_TAIL_THRESHOLD
  521. ):
  522. # Check if combined size doesn't exceed MAX
  523. combined_tokens = (
  524. current_tokens + remaining_same_level_tokens
  525. )
  526. if combined_tokens <= MAX_BLOCK_CONTENT_TOKENS:
  527. # Absorb all remaining same-level blocks
  528. absorbed_content = current_block["content"]
  529. last_uuid_end = current_block.get(
  530. "uuid_end", current_block["uuid"]
  531. )
  532. combined_headers = list(
  533. current_block.get("table_headers", [])
  534. )
  535. for j in range(i + 1, remaining_end_idx):
  536. next_block = result[j]
  537. absorbed_content += "\n\n" + next_block["content"]
  538. last_uuid_end = next_block.get(
  539. "uuid_end", next_block["uuid"]
  540. )
  541. combined_headers.extend(
  542. next_block.get("table_headers", [])
  543. )
  544. # Create merged block
  545. merged_block = {
  546. "uuid": current_block["uuid"],
  547. "uuid_end": last_uuid_end,
  548. "heading": current_block["heading"],
  549. "content": absorbed_content,
  550. "type": "text",
  551. "parent_headings": current_block["parent_headings"],
  552. "level": current_level,
  553. "table_chunk_role": "none",
  554. }
  555. if combined_headers:
  556. merged_block["table_headers"] = combined_headers
  557. new_result.append(merged_block)
  558. merged_count += remaining_end_idx - i - 1
  559. changed = True
  560. i = remaining_end_idx
  561. if debug:
  562. num_absorbed = remaining_end_idx - i - 1
  563. print(
  564. f" Tail absorption: block at IDEAL ({current_tokens} tokens) absorbed {num_absorbed} small tail blocks ({remaining_same_level_tokens} tokens)",
  565. file=sys.stderr,
  566. )
  567. continue
  568. # No tail absorption, keep block as-is
  569. new_result.append(current_block)
  570. i += 1
  571. result = new_result
  572. if debug and changed:
  573. print(
  574. f" Phase A iteration {iteration}: {merged_count} total merges",
  575. file=sys.stderr,
  576. )
  577. # Phase B: Cross-level absorption (allow higher levels to absorb current level)
  578. changed = True
  579. iteration = 0
  580. while changed:
  581. iteration += 1
  582. changed = False
  583. i = 0
  584. new_result = []
  585. while i < len(result):
  586. current_block = result[i]
  587. current_tokens = estimate_tokens(current_block["content"])
  588. block_level = current_block.get("level", 1)
  589. current_role = current_block.get("table_chunk_role", "none")
  590. # Only process blocks of current level that are below IDEAL
  591. is_below_ideal = (
  592. current_tokens < IDEAL_BLOCK_CONTENT_TOKENS and current_tokens > 0
  593. )
  594. is_current_level = block_level == current_level
  595. if is_below_ideal and is_current_level:
  596. merged = False
  597. can_merge_forward = current_role in ["none", "first", "last"]
  598. can_merge_backward = current_role in ["none", "last"]
  599. # Try forward merge (current can absorb deeper levels)
  600. if can_merge_forward and i + 1 < len(result):
  601. next_block = result[i + 1]
  602. next_level = next_block.get("level", 1)
  603. next_role = next_block.get("table_chunk_role", "none")
  604. next_can_merge_backward = next_role in ["none", "last"]
  605. # Phase B: current level can absorb deeper levels (larger numbers)
  606. if next_level > current_level and next_can_merge_backward:
  607. merged_content = (
  608. current_block["content"]
  609. + "\n\n"
  610. + next_block["content"]
  611. )
  612. combined_tokens = estimate_tokens(merged_content)
  613. if combined_tokens <= MAX_BLOCK_CONTENT_TOKENS:
  614. merged_block = {
  615. "uuid": current_block["uuid"],
  616. "uuid_end": next_block.get(
  617. "uuid_end", next_block["uuid"]
  618. ),
  619. "heading": current_block["heading"],
  620. "content": merged_content,
  621. "type": "text",
  622. "parent_headings": current_block["parent_headings"],
  623. "level": current_level,
  624. "table_chunk_role": "none",
  625. }
  626. combined_headers = current_block.get(
  627. "table_headers", []
  628. ) + next_block.get("table_headers", [])
  629. if combined_headers:
  630. merged_block["table_headers"] = combined_headers
  631. new_result.append(merged_block)
  632. merged = True
  633. merged_count += 1
  634. changed = True
  635. i += 2
  636. continue
  637. # Try backward merge (higher level can absorb current)
  638. if not merged and can_merge_backward and len(new_result) > 0:
  639. prev_block = new_result[-1]
  640. prev_level = prev_block.get("level", 1)
  641. prev_role = prev_block.get("table_chunk_role", "none")
  642. prev_tokens = estimate_tokens(prev_block["content"])
  643. prev_can_merge_forward = prev_role in ["none", "first", "last"]
  644. prev_below_ideal = prev_tokens < IDEAL_BLOCK_CONTENT_TOKENS
  645. # Phase B: higher level (smaller number) can absorb current level
  646. if (
  647. prev_level < current_level
  648. and prev_can_merge_forward
  649. and prev_below_ideal
  650. ):
  651. merged_content = (
  652. prev_block["content"]
  653. + "\n\n"
  654. + current_block["content"]
  655. )
  656. combined_tokens = estimate_tokens(merged_content)
  657. if combined_tokens <= MAX_BLOCK_CONTENT_TOKENS:
  658. merged_block = {
  659. "uuid": prev_block["uuid"],
  660. "uuid_end": current_block.get(
  661. "uuid_end", current_block["uuid"]
  662. ),
  663. "heading": prev_block["heading"],
  664. "content": merged_content,
  665. "type": "text",
  666. "parent_headings": prev_block["parent_headings"],
  667. "level": prev_level,
  668. "table_chunk_role": "none",
  669. }
  670. combined_headers = prev_block.get(
  671. "table_headers", []
  672. ) + current_block.get("table_headers", [])
  673. if combined_headers:
  674. merged_block["table_headers"] = combined_headers
  675. new_result[-1] = merged_block
  676. merged = True
  677. merged_count += 1
  678. changed = True
  679. i += 1
  680. continue
  681. if not merged:
  682. new_result.append(current_block)
  683. i += 1
  684. else:
  685. new_result.append(current_block)
  686. i += 1
  687. result = new_result
  688. if debug and changed:
  689. print(
  690. f" Phase B iteration {iteration}: {merged_count} total merges",
  691. file=sys.stderr,
  692. )
  693. if debug:
  694. print(
  695. f"[DEBUG] merge_small_blocks complete: {len(result)} blocks, {merged_count} total merges",
  696. file=sys.stderr,
  697. )
  698. # Check for oversized blocks and print debug information
  699. oversized_blocks = []
  700. for idx, block in enumerate(result):
  701. block_tokens = estimate_tokens(block["content"])
  702. if block_tokens > 0: # MAX_BLOCK_CONTENT_TOKENS:
  703. oversized_blocks.append(
  704. {
  705. "index": idx,
  706. "heading": block.get("heading", "(no heading)"),
  707. "level": block.get("level", "N/A"),
  708. "tokens": block_tokens,
  709. "has_table_header": bool(block.get("table_headers")),
  710. "content_preview": block["content"][:200],
  711. }
  712. )
  713. if oversized_blocks:
  714. print(
  715. f"\n[WARNING] Found {len(oversized_blocks)} oversized blocks after merging:",
  716. file=sys.stderr,
  717. )
  718. for info in oversized_blocks:
  719. print(
  720. f" Block #{info['index']}: level={info['level']}, tokens={info['tokens']}, heading=\"{info['heading']}\"",
  721. file=sys.stderr,
  722. )
  723. return result, merged_count
  724. def split_long_block(
  725. block_heading: str,
  726. paragraphs: list,
  727. parent_headings: list,
  728. block_level: int,
  729. debug: bool = False,
  730. ) -> list:
  731. """
  732. Split a long text block into smaller blocks using anchor paragraphs.
  733. Strategy (improved for balanced splitting):
  734. 1. Calculate target number of blocks based on IDEAL_BLOCK_CONTENT_TOKENS
  735. 2. Ensure minimum blocks needed to stay under MAX_BLOCK_CONTENT_TOKENS
  736. 3. Find all candidate anchor paragraphs (<= MAX_ANCHOR_CANDIDATE_LENGTH chars)
  737. 4. Select anchors closest to ideal split positions for balanced distribution
  738. 5. Create blocks using selected anchors as new headings
  739. Important: Tables are NOT split by this function.
  740. - Tables are already split at row boundaries by split_table() if needed (TABLE_MAX_TOKENS limit)
  741. - Table paragraphs (is_table=True) are excluded from anchor candidate selection
  742. - Table content remains intact and is not re-split into smaller table chunks
  743. - If a block contains both text and table chunks exceeding the limit, only text
  744. paragraphs are used as split points; table chunks stay complete
  745. Args:
  746. block_heading: Original heading text
  747. paragraphs: List of dicts with 'text', 'para_id', and 'is_table' keys
  748. parent_headings: Parent heading stack
  749. block_level: Heading level of this block (1=Heading 1, 2=Heading 2, etc.)
  750. debug: If True, output debug information when splitting occurs
  751. Returns:
  752. List of block dictionaries (may be split into multiple blocks), each with 'level' field
  753. Exits:
  754. sys.exit(1) if no suitable anchor found and content exceeds limit
  755. """
  756. import math
  757. # Check if this block starts with a split table chunk (has _chunk_heading metadata)
  758. # If so, use that heading instead of block_heading
  759. effective_heading = block_heading
  760. if paragraphs and paragraphs[0].get("_chunk_heading"):
  761. effective_heading = paragraphs[0]["_chunk_heading"]
  762. # Calculate total content token count
  763. total_content = "\n".join(p["text"] for p in paragraphs)
  764. total_tokens = estimate_tokens(total_content)
  765. if total_tokens <= MAX_BLOCK_CONTENT_TOKENS:
  766. # Within limit, return as single block
  767. # Use first paragraph's para_id as UUID
  768. # For uuid_end: use para_id_end if last element is a table, otherwise para_id
  769. last_para = paragraphs[-1] if paragraphs else {}
  770. uuid_end = last_para.get("para_id_end") or last_para.get("para_id")
  771. block = {
  772. "uuid": paragraphs[0]["para_id"] if paragraphs else None,
  773. "uuid_end": uuid_end,
  774. "heading": effective_heading,
  775. "content": total_content,
  776. "type": "text",
  777. "parent_headings": parent_headings,
  778. "level": block_level, # Add level to block
  779. }
  780. # Collect per-table cross-page headers (aligned with <table> tag order)
  781. table_headers = _collect_table_headers(paragraphs)
  782. if table_headers:
  783. block["table_headers"] = table_headers
  784. return [block]
  785. # Content exceeds limit, need to split
  786. # Calculate target number of blocks based on IDEAL_BLOCK_CONTENT_TOKENS
  787. target_blocks = math.ceil(total_tokens / IDEAL_BLOCK_CONTENT_TOKENS)
  788. # Ensure we have enough blocks to stay under MAX_BLOCK_CONTENT_TOKENS
  789. min_blocks_needed = math.ceil(total_tokens / MAX_BLOCK_CONTENT_TOKENS)
  790. target_blocks = max(target_blocks, min_blocks_needed)
  791. # Calculate ideal token size per block
  792. target_size = total_tokens / target_blocks
  793. # Find candidate anchors (short paragraphs, excluding tables and empty placeholders)
  794. # Use character length for anchor candidate selection (UI/readability constraint)
  795. candidates = []
  796. cumulative_tokens = 0
  797. for idx, para in enumerate(paragraphs):
  798. if (
  799. not para.get("is_table", False)
  800. and 0 < len(para["text"]) <= MAX_ANCHOR_CANDIDATE_LENGTH
  801. ):
  802. candidates.append(
  803. {
  804. "index": idx,
  805. "text": para["text"],
  806. "para_id": para["para_id"],
  807. "position": cumulative_tokens,
  808. }
  809. )
  810. cumulative_tokens += estimate_tokens(para["text"])
  811. if not candidates:
  812. # No suitable anchor found
  813. preview = (
  814. block_heading[:80] + "..." if len(block_heading) > 80 else block_heading
  815. )
  816. print_error(
  817. "Cannot split long block (no suitable anchor paragraphs found)",
  818. f"A text block is too long (~{total_tokens} tokens, max {MAX_BLOCK_CONTENT_TOKENS})\n"
  819. f"but no paragraphs <= {MAX_ANCHOR_CANDIDATE_LENGTH} characters were found to use as split points.\n\n"
  820. f'Location: Under heading "{preview}"\n'
  821. f"Block size: ~{total_tokens} tokens ({len(total_content)} characters)\n"
  822. f"Number of paragraphs: {len(paragraphs)}\n"
  823. f"Calculated target blocks: {target_blocks}",
  824. " 1. Open the document in Microsoft Word\n"
  825. f' 2. Locate the section under heading "{preview}"\n'
  826. f" 3. Add short headings or paragraph breaks (≤{MAX_ANCHOR_CANDIDATE_LENGTH} chars) to divide the content\n"
  827. " 4. Re-upload it to LightRAG",
  828. )
  829. sys.exit(1)
  830. # Select anchors for splitting (target_blocks - 1 split points needed)
  831. selected_anchors = []
  832. remaining_candidates = candidates.copy()
  833. for i in range(1, target_blocks):
  834. if not remaining_candidates:
  835. break
  836. # Calculate ideal position for this split (in tokens)
  837. ideal_position = i * target_size
  838. # Find candidate closest to ideal position
  839. best_candidate = min(
  840. remaining_candidates, key=lambda c: abs(c["position"] - ideal_position)
  841. )
  842. selected_anchors.append(best_candidate)
  843. remaining_candidates.remove(best_candidate)
  844. # Sort selected anchors by index to maintain document order
  845. selected_anchors.sort(key=lambda a: a["index"])
  846. # Create blocks using selected split points
  847. result_blocks = []
  848. prev_idx = 0
  849. current_parent_headings = parent_headings
  850. current_block_heading = block_heading
  851. for anchor in selected_anchors:
  852. split_idx = anchor["index"]
  853. # Create block from prev_idx to split_idx (exclusive)
  854. block_paragraphs = paragraphs[prev_idx:split_idx]
  855. if block_paragraphs:
  856. block_content = "\n".join(p["text"] for p in block_paragraphs)
  857. # For uuid_end: use para_id_end if last element is a table, otherwise para_id
  858. last_para = block_paragraphs[-1]
  859. block_uuid_end = last_para.get("para_id_end") or last_para.get("para_id")
  860. new_block = {
  861. "uuid": block_paragraphs[0][
  862. "para_id"
  863. ], # UUID from first paragraph in content
  864. "uuid_end": block_uuid_end, # UUID_end from last paragraph (or table's last cell)
  865. "heading": current_block_heading,
  866. "content": block_content,
  867. "type": "text",
  868. "parent_headings": current_parent_headings,
  869. "_paragraphs": block_paragraphs, # Keep original paragraphs for potential re-splitting
  870. }
  871. new_table_headers = _collect_table_headers(block_paragraphs)
  872. if new_table_headers:
  873. new_block["table_headers"] = new_table_headers
  874. result_blocks.append(new_block)
  875. # Validate anchor as new heading
  876. validate_heading_length(anchor["text"], anchor["para_id"])
  877. # Update for next block
  878. current_block_heading = anchor["text"]
  879. # Update parent headings: add previous heading only if not "Preface/Uncategorized"
  880. if block_heading != "Preface/Uncategorized":
  881. current_parent_headings = parent_headings + [block_heading]
  882. prev_idx = (
  883. split_idx # Don't skip anchor - it becomes first paragraph of next block
  884. )
  885. # Create final block with remaining paragraphs
  886. final_paragraphs = paragraphs[prev_idx:]
  887. if final_paragraphs:
  888. final_content = "\n".join(p["text"] for p in final_paragraphs)
  889. # For uuid_end: use para_id_end if last element is a table, otherwise para_id
  890. last_final_para = final_paragraphs[-1]
  891. final_uuid_end = last_final_para.get("para_id_end") or last_final_para.get(
  892. "para_id"
  893. )
  894. final_block = {
  895. "uuid": final_paragraphs[0][
  896. "para_id"
  897. ], # UUID from first paragraph in content
  898. "uuid_end": final_uuid_end, # UUID_end from last paragraph (or table's last cell)
  899. "heading": current_block_heading,
  900. "content": final_content,
  901. "type": "text",
  902. "parent_headings": current_parent_headings,
  903. "_paragraphs": final_paragraphs, # Keep original paragraphs for potential re-splitting
  904. }
  905. final_table_headers = _collect_table_headers(final_paragraphs)
  906. if final_table_headers:
  907. final_block["table_headers"] = final_table_headers
  908. result_blocks.append(final_block)
  909. # Post-split validation: Check if any block still exceeds MAX_BLOCK_CONTENT_TOKENS
  910. # If so, recursively split that block (handles sparse anchor scenarios)
  911. validated_blocks = []
  912. for block in result_blocks:
  913. block_tokens = estimate_tokens(block["content"])
  914. if block_tokens > MAX_BLOCK_CONTENT_TOKENS:
  915. # This block is still too large - need to recursively split it
  916. # Use the preserved paragraph structure
  917. block_paragraphs = block.get("_paragraphs", [])
  918. if not block_paragraphs:
  919. # Fallback: shouldn't happen, but handle gracefully
  920. preview = (
  921. block["heading"][:80] + "..."
  922. if len(block["heading"]) > 80
  923. else block["heading"]
  924. )
  925. print_error(
  926. "Cannot re-split oversized block (internal error)",
  927. f"A block exceeded MAX_BLOCK_CONTENT_TOKENS but paragraph metadata was lost.\n\n"
  928. f"Location: Under heading \"{preview}\"\n"
  929. f"Block size: ~{block_tokens} tokens ({len(block['content'])} characters)",
  930. "This is an internal error. Please report this issue.",
  931. )
  932. sys.exit(1)
  933. # Recursively split this oversized block
  934. # The recursive call will either find more anchors or raise an error
  935. sub_blocks = split_long_block(
  936. block["heading"],
  937. block_paragraphs,
  938. block["parent_headings"],
  939. block_level,
  940. debug,
  941. )
  942. validated_blocks.extend(sub_blocks)
  943. else:
  944. # Remove internal _paragraphs field before adding to final output
  945. block.pop("_paragraphs", None)
  946. validated_blocks.append(block)
  947. # Add level to all blocks
  948. for block in validated_blocks:
  949. block["level"] = block_level
  950. # Output debug information if enabled and split occurred
  951. if debug and len(validated_blocks) > 1:
  952. print(f'\n[DEBUG] Block split: "{block_heading}"', file=sys.stderr)
  953. print(
  954. f" Original size: ~{total_tokens} tokens ({len(total_content)} characters)",
  955. file=sys.stderr,
  956. )
  957. block_tokens = [estimate_tokens(block["content"]) for block in validated_blocks]
  958. print(
  959. f" Final result: {len(validated_blocks)} blocks: ~{block_tokens} tokens",
  960. file=sys.stderr,
  961. )
  962. return validated_blocks
  963. def extract_para_id(para_element) -> str:
  964. """
  965. Extract w14:paraId attribute from paragraph element.
  966. Args:
  967. para_element: lxml paragraph element
  968. Returns:
  969. 8-character hex paraId, or ``None`` when the paragraph carries no
  970. ``w14:paraId`` attribute (legacy / non-Word docx authors). Callers
  971. propagate the ``None`` upward — the LightRAG adapter counts these
  972. and surfaces a single warning per document.
  973. """
  974. return para_element.get(
  975. "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
  976. )
  977. def parse_styles_outline_levels(docx_path: str) -> dict:
  978. """
  979. Parse styles.xml to extract outlineLvl definitions for each style,
  980. following style inheritance chain (basedOn).
  981. Args:
  982. docx_path: Path to DOCX file
  983. Returns:
  984. dict: styleId -> outlineLvl (0-8 for headings, 9 for body text)
  985. """
  986. import zipfile
  987. try:
  988. from defusedxml import ElementTree as ET
  989. except ImportError:
  990. from xml.etree import ElementTree as ET
  991. styles_outline = {} # styleId -> outlineLvl (directly defined)
  992. style_based_on = {} # styleId -> parent styleId
  993. try:
  994. with zipfile.ZipFile(docx_path, "r") as zf:
  995. if "word/styles.xml" not in zf.namelist():
  996. return styles_outline
  997. tree = ET.parse(zf.open("word/styles.xml"))
  998. root = tree.getroot()
  999. ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
  1000. # First pass: collect outlineLvl and basedOn for all styles
  1001. for style in root.findall(f".//{{{ns}}}style"):
  1002. style_id = style.get(f"{{{ns}}}styleId")
  1003. if not style_id:
  1004. continue
  1005. # Check for basedOn (style inheritance)
  1006. based_on = style.find(f"{{{ns}}}basedOn")
  1007. if based_on is not None:
  1008. parent_id = based_on.get(f"{{{ns}}}val")
  1009. if parent_id:
  1010. style_based_on[style_id] = parent_id
  1011. # Check for outlineLvl in style's pPr
  1012. pPr = style.find(f"{{{ns}}}pPr")
  1013. if pPr is not None:
  1014. outline_lvl_elem = pPr.find(f"{{{ns}}}outlineLvl")
  1015. if outline_lvl_elem is not None:
  1016. level = int(outline_lvl_elem.get(f"{{{ns}}}val"))
  1017. styles_outline[style_id] = level
  1018. # Second pass: resolve inheritance chain for styles without direct outlineLvl
  1019. def get_outline_level(style_id: str, visited: set = None) -> int:
  1020. if visited is None:
  1021. visited = set()
  1022. if style_id in visited:
  1023. return None # Prevent circular references
  1024. visited.add(style_id)
  1025. # If this style directly defines outlineLvl, return it
  1026. if style_id in styles_outline:
  1027. return styles_outline[style_id]
  1028. # Otherwise check parent style
  1029. if style_id in style_based_on:
  1030. parent_id = style_based_on[style_id]
  1031. return get_outline_level(parent_id, visited)
  1032. return None
  1033. # Fill in missing outlineLvl from inheritance chain
  1034. all_style_ids = set(styles_outline.keys()) | set(style_based_on.keys())
  1035. for style_id in all_style_ids:
  1036. if style_id not in styles_outline:
  1037. level = get_outline_level(style_id)
  1038. if level is not None:
  1039. styles_outline[style_id] = level
  1040. except Exception:
  1041. # Silently ignore parsing errors
  1042. pass
  1043. return styles_outline
  1044. def get_heading_level(para_element, styles_outline_map: dict) -> int:
  1045. """
  1046. Get heading level from paragraph, checking both direct format and style.
  1047. Priority: paragraph outlineLvl > style outlineLvl
  1048. Args:
  1049. para_element: lxml paragraph element
  1050. styles_outline_map: dict of styleId -> outlineLvl from styles.xml
  1051. Returns:
  1052. int: 0-8 for heading levels (0=level 1, 1=level 2, etc.), None for non-heading
  1053. """
  1054. # 1. Check paragraph direct format
  1055. pPr = para_element.find(
  1056. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr"
  1057. )
  1058. if pPr is not None:
  1059. outline_elem = pPr.find(
  1060. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}outlineLvl"
  1061. )
  1062. if outline_elem is not None:
  1063. level = int(
  1064. outline_elem.get(
  1065. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
  1066. )
  1067. )
  1068. # Only 0-8 are true heading levels (9 is body text)
  1069. if level < 9:
  1070. return level
  1071. else:
  1072. return None # Level 9 is body text
  1073. # 2. Check style definition's outlineLvl
  1074. if pPr is not None:
  1075. pStyle_elem = pPr.find(
  1076. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pStyle"
  1077. )
  1078. if pStyle_elem is not None:
  1079. style_id = pStyle_elem.get(
  1080. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
  1081. )
  1082. if style_id and style_id in styles_outline_map:
  1083. level = styles_outline_map[style_id]
  1084. if level < 9:
  1085. return level
  1086. else:
  1087. return None
  1088. return None
  1089. def extract_text_from_run(
  1090. run,
  1091. ns: dict,
  1092. drawing_context: DrawingExtractionContext = None,
  1093. ) -> str:
  1094. """
  1095. Extract text from a run element, preserving superscript/subscript with markup.
  1096. Converts Word formatting to HTML-like tags:
  1097. - Superscript: <sup>text</sup>
  1098. - Subscript: <sub>text</sub>
  1099. - Normal text: unchanged
  1100. Args:
  1101. run: lxml run element (w:r)
  1102. ns: XML namespace dictionary
  1103. Returns:
  1104. Text string with <sup>/<sub> markup for formatted portions
  1105. """
  1106. text = ""
  1107. # Check for vertAlign in rPr (superscript/subscript)
  1108. vert_align = None
  1109. rPr = run.find("w:rPr", ns)
  1110. if rPr is not None:
  1111. vert_elem = rPr.find("w:vertAlign", ns)
  1112. if vert_elem is not None:
  1113. vert_align = vert_elem.get(
  1114. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
  1115. )
  1116. # Extract text content from run children
  1117. for child in run:
  1118. tag = child.tag.split("}")[-1] # Remove namespace
  1119. if tag == "t" and child.text:
  1120. text += child.text
  1121. elif tag == "tab":
  1122. text += "\t"
  1123. elif tag == "br":
  1124. # Handle line breaks - textWrapping or no type = soft line break
  1125. br_type = child.get(
  1126. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type"
  1127. )
  1128. if br_type in (None, "textWrapping"):
  1129. text += "\n"
  1130. # Skip page and column breaks (layout elements)
  1131. elif tag == "drawing":
  1132. text += extract_drawing_placeholder_from_element(
  1133. child,
  1134. context=drawing_context,
  1135. include_extended_attrs=True,
  1136. )
  1137. elif tag in ("pict", "object"):
  1138. text += extract_vml_image_placeholder_from_element(
  1139. child,
  1140. context=drawing_context,
  1141. include_extended_attrs=True,
  1142. )
  1143. # Apply superscript/subscript markup if needed
  1144. if text and vert_align == "superscript":
  1145. return f"<sup>{text}</sup>"
  1146. elif text and vert_align == "subscript":
  1147. return f"<sub>{text}</sub>"
  1148. return text
  1149. def extract_paragraph_content(
  1150. element,
  1151. ns,
  1152. drawing_context: DrawingExtractionContext = None,
  1153. ) -> str:
  1154. """
  1155. Extract text and equations from a paragraph element in document order.
  1156. Handles w:r (text runs), m:oMath (inline equations), and m:oMathPara
  1157. (block equations). Recurses into container elements (e.g., w:hyperlink,
  1158. w:ins, w:sdt, w:fldSimple, w:smartTag) to avoid dropping content.
  1159. Args:
  1160. element: lxml paragraph element (w:p)
  1161. ns: XML namespace dictionary
  1162. Returns:
  1163. Text string with equations wrapped in <equation> tags
  1164. """
  1165. parts = []
  1166. def append_from(node) -> None:
  1167. tag = node.tag.split("}")[-1]
  1168. # Drop tracked-change deletions (w:del/w:moveFrom) and comment markers
  1169. # (w:commentRangeStart/End, w:commentReference, w:annotationRef) so the
  1170. # output only contains the final revised text without annotation glyphs.
  1171. if tag in _SKIP_PARAGRAPH_TAGS:
  1172. return
  1173. if tag == "r":
  1174. parts.append(
  1175. extract_text_from_run(node, ns, drawing_context=drawing_context)
  1176. )
  1177. return
  1178. if tag == "oMath":
  1179. from .omml import convert_omml_to_latex
  1180. latex = convert_omml_to_latex(node)
  1181. if latex:
  1182. parts.append(f"<equation>{latex}</equation>")
  1183. return
  1184. if tag == "oMathPara":
  1185. from .omml import convert_omml_to_latex
  1186. for omath in node:
  1187. if omath.tag.split("}")[-1] == "oMath":
  1188. latex = convert_omml_to_latex(omath)
  1189. if latex:
  1190. parts.append(f"<equation>{latex}</equation>")
  1191. return
  1192. for child in node:
  1193. append_from(child)
  1194. for child in element:
  1195. append_from(child)
  1196. return "".join(parts)
  1197. def _is_table_empty(rows: list) -> bool:
  1198. """Return True iff every cell in ``rows`` is whitespace-only."""
  1199. return all(not (cell or "").strip() for row in rows for cell in row)
  1200. def _collect_table_headers(paragraphs: list) -> list:
  1201. """Collect per-table cross-page header rows from ``is_table`` paragraphs.
  1202. The returned list is aligned 1:1 with the order of ``<table>`` placeholder
  1203. tags emitted into the block's content; entries are either the list of
  1204. header rows captured from ``w:tblHeader`` or ``None`` when the table has
  1205. no cross-page repeating header.
  1206. """
  1207. return [p.get("_table_header") for p in paragraphs if p.get("is_table")]
  1208. def _build_unsplit_block(
  1209. heading: str, paragraphs: list, parent_headings: list, level: int
  1210. ) -> dict:
  1211. """Build a single block from paragraphs without size-based splitting."""
  1212. last_para = paragraphs[-1]
  1213. block = {
  1214. "uuid": paragraphs[0]["para_id"],
  1215. "uuid_end": last_para.get("para_id_end") or last_para.get("para_id"),
  1216. "heading": heading,
  1217. "content": "\n".join(p["text"] for p in paragraphs),
  1218. "type": "text",
  1219. "parent_headings": parent_headings,
  1220. "level": level,
  1221. }
  1222. table_headers = _collect_table_headers(paragraphs)
  1223. if table_headers:
  1224. block["table_headers"] = table_headers
  1225. return block
  1226. def _flush_current_block(
  1227. blocks: list,
  1228. heading: str,
  1229. paragraphs: list,
  1230. parent_headings: list,
  1231. level: int,
  1232. fixlevel: int,
  1233. debug: bool,
  1234. ) -> None:
  1235. """
  1236. Flush accumulated paragraphs into blocks, respecting fixlevel mode.
  1237. In default mode (fixlevel is None), runs split_long_block for token-based splitting.
  1238. In fixlevel mode, emits a single unsplit block and warns when size exceeds the limit.
  1239. """
  1240. if not paragraphs:
  1241. return
  1242. if fixlevel is None:
  1243. blocks.extend(
  1244. split_long_block(heading, paragraphs, parent_headings, level, debug)
  1245. )
  1246. return
  1247. block = _build_unsplit_block(heading, paragraphs, parent_headings, level)
  1248. block_tokens = estimate_tokens(block["content"])
  1249. if block_tokens > MAX_BLOCK_CONTENT_TOKENS:
  1250. preview = heading[:80] + "..." if len(heading) > 80 else heading
  1251. print(
  1252. f"Warning: fixlevel block exceeds {MAX_BLOCK_CONTENT_TOKENS} tokens "
  1253. f'(~{block_tokens} tokens) under heading "{preview}". '
  1254. f"Consider increasing --fixlevel=N or removing --fixlevel for automatic splitting.",
  1255. file=sys.stderr,
  1256. )
  1257. blocks.append(block)
  1258. def extract_docx_blocks(
  1259. file_path: str,
  1260. debug: bool = False,
  1261. fixlevel: int = None,
  1262. drawing_context: DrawingExtractionContext = None,
  1263. parse_warnings: dict | None = None,
  1264. parse_metadata: dict | None = None,
  1265. ) -> list:
  1266. """
  1267. Extract text blocks (chunks) from a DOCX file for chunking later.
  1268. Uses python-docx with custom numbering resolver to:
  1269. 1. Capture automatic numbering (list labels)
  1270. 2. Split document by headings
  1271. 3. Convert tables to JSON (2D array)
  1272. 4. Validate heading lengths and table sizes
  1273. 5. Split long blocks using anchor paragraphs
  1274. 6. Preserve superscript/subscript formatting with <sup>/<sub> markup
  1275. Args:
  1276. file_path: Path to the DOCX file
  1277. debug: If True, output debug information when splitting blocks
  1278. fixlevel: If specified, disable smart splitting/merging and only split at heading levels <= fixlevel
  1279. (0 = split at all heading levels, 1 = Heading 1 only, 2 = Heading 1-2, etc.)
  1280. parse_warnings: Optional out-dict that this function mutates with
  1281. non-fatal warnings observed during parsing. Currently used for
  1282. ``missing_paraid_count`` — incremented once per body-level
  1283. paragraph (heading or text) that lacks a ``w14:paraId`` and once
  1284. per table whose every cell lacks one. Callers (the LightRAG
  1285. adapter / debug CLI) read this to surface a one-line warning per
  1286. document instead of crashing.
  1287. parse_metadata: Optional out-dict that this function mutates with
  1288. document-level metadata derived during parsing. Currently used
  1289. for ``first_heading`` — the text of the first heading encountered
  1290. in document order (regardless of level). Used by the LightRAG
  1291. adapter to populate ``meta.doc_title`` in ``.blocks.jsonl``.
  1292. Returns:
  1293. List of block dictionaries with heading, content, type, and metadata
  1294. """
  1295. doc = Document(file_path)
  1296. resolver = NumberingResolver(file_path)
  1297. styles_outline = parse_styles_outline_levels(file_path)
  1298. blocks = []
  1299. current_heading = "Preface/Uncategorized"
  1300. current_heading_level = 1 # Default level for "Preface/Uncategorized"
  1301. current_heading_stack = {} # {level: heading_text} - Use dict to correctly track heading hierarchy
  1302. current_parent_headings = [] # Parent headings for current block
  1303. current_paragraphs = [] # Track paragraphs with metadata for splitting
  1304. has_body_content = (
  1305. False # Track if current block has body content (non-heading paragraphs/tables)
  1306. )
  1307. matched_fixlevel_heading = False # Track whether --fixlevel matched any heading
  1308. table_split_counter = (
  1309. 0 # Track cumulative table split suffix numbers within current block
  1310. )
  1311. first_heading_recorded = (
  1312. False # Track whether the document's first heading has been captured
  1313. )
  1314. # Iterate through document body elements (paragraphs and tables)
  1315. body = doc._element.body
  1316. for element in body:
  1317. tag = element.tag.split("}")[-1] # Remove namespace
  1318. if tag == "sectPr": # Document-level section break
  1319. resolver.reset_tracking_state()
  1320. continue
  1321. if tag == "p": # Paragraph
  1322. # Get paragraph text with superscript/subscript markup and equations
  1323. para_text = ""
  1324. ns = {
  1325. "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
  1326. "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
  1327. "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
  1328. }
  1329. para_text = extract_paragraph_content(
  1330. element,
  1331. ns,
  1332. drawing_context=drawing_context,
  1333. )
  1334. para_text = para_text.strip()
  1335. if not para_text:
  1336. continue
  1337. # Get numbering label using our resolver
  1338. label = resolver.get_label(element)
  1339. full_text = f"{label} {para_text}".strip() if label else para_text
  1340. # Check if this is a heading using the new function
  1341. outline_level = get_heading_level(element, styles_outline)
  1342. if outline_level is not None:
  1343. # This is a heading (outline level 0-8)
  1344. # Convert 0-based to 1-based level
  1345. level = outline_level + 1
  1346. # In fixlevel mode, check if this heading should trigger a block split
  1347. should_split = True
  1348. if fixlevel is not None and fixlevel > 0:
  1349. # If fixlevel is specified and > 0, only split at levels <= fixlevel
  1350. should_split = level <= fixlevel
  1351. # Extract paraId for this heading
  1352. heading_para_id = extract_para_id(element)
  1353. if parse_warnings is not None and not heading_para_id:
  1354. parse_warnings["missing_paraid_count"] = (
  1355. parse_warnings.get("missing_paraid_count", 0) + 1
  1356. )
  1357. # Validate heading length
  1358. validate_heading_length(full_text, heading_para_id)
  1359. # Truncate heading if needed before storing
  1360. truncated_text = truncate_heading(full_text, heading_para_id)
  1361. # Record the document's first heading (any level) for meta.doc_title.
  1362. if not first_heading_recorded:
  1363. if parse_metadata is not None:
  1364. parse_metadata["first_heading"] = truncated_text
  1365. first_heading_recorded = True
  1366. if should_split:
  1367. if fixlevel is not None and fixlevel > 0:
  1368. matched_fixlevel_heading = True
  1369. # This heading triggers a block split
  1370. # Only save previous block if it has body content
  1371. if has_body_content and current_paragraphs:
  1372. _flush_current_block(
  1373. blocks,
  1374. current_heading,
  1375. current_paragraphs,
  1376. current_parent_headings,
  1377. current_heading_level,
  1378. fixlevel,
  1379. debug,
  1380. )
  1381. # Reset for new block
  1382. current_paragraphs = []
  1383. has_body_content = False
  1384. table_split_counter = (
  1385. 0 # Reset table split counter for new heading
  1386. )
  1387. # Add heading to current_paragraphs
  1388. current_paragraphs.append(
  1389. {
  1390. "text": truncated_text,
  1391. "para_id": heading_para_id,
  1392. "is_table": False,
  1393. }
  1394. )
  1395. # Update current_heading and parent_headings for the FIRST heading in a block
  1396. # (when current_paragraphs just had this heading added as its first element)
  1397. if len(current_paragraphs) == 1:
  1398. current_heading = truncated_text
  1399. current_heading_level = (
  1400. level # Only set level when setting heading
  1401. )
  1402. # Parent headings = all headings from levels strictly less than current level
  1403. # Sort by level to maintain hierarchy order
  1404. current_parent_headings = [
  1405. current_heading_stack[lvl]
  1406. for lvl in sorted(current_heading_stack.keys())
  1407. if lvl < level
  1408. ]
  1409. # Update heading stack: remove current level and all lower levels, then add current
  1410. current_heading_stack = {
  1411. k: v for k, v in current_heading_stack.items() if k < level
  1412. }
  1413. current_heading_stack[level] = truncated_text
  1414. else:
  1415. # This heading doesn't trigger split - treat as regular paragraph
  1416. para_id = heading_para_id
  1417. # Store as regular paragraph with metadata
  1418. current_paragraphs.append(
  1419. {"text": truncated_text, "para_id": para_id, "is_table": False}
  1420. )
  1421. # Mark that we have body content
  1422. has_body_content = True
  1423. else:
  1424. # Regular paragraph content
  1425. para_id = extract_para_id(element)
  1426. if parse_warnings is not None and not para_id:
  1427. parse_warnings["missing_paraid_count"] = (
  1428. parse_warnings.get("missing_paraid_count", 0) + 1
  1429. )
  1430. # Store paragraph with metadata for potential splitting
  1431. current_paragraphs.append(
  1432. {"text": full_text, "para_id": para_id, "is_table": False}
  1433. )
  1434. # Mark that we have body content
  1435. has_body_content = True
  1436. # Check for paragraph-level section break (after processing paragraph)
  1437. # sectPr in pPr means this paragraph ends a section
  1438. pPr = element.find(
  1439. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pPr"
  1440. )
  1441. if pPr is not None:
  1442. sectPr = pPr.find(
  1443. "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sectPr"
  1444. )
  1445. if sectPr is not None:
  1446. # Section break after this paragraph - reset tracking
  1447. resolver.reset_tracking_state()
  1448. elif tag == "tbl": # Table
  1449. # Reset numbering tracking before table (table start boundary)
  1450. resolver.reset_tracking_state()
  1451. # Directly create Table object from XML element to avoid index mismatch
  1452. # (doc.tables may have different order due to nested tables)
  1453. from docx.table import Table
  1454. table = Table(element, doc)
  1455. table_metadata = TableExtractor.extract_with_metadata(
  1456. table,
  1457. numbering_resolver=resolver,
  1458. drawing_context=drawing_context,
  1459. )
  1460. table_rows = table_metadata["rows"]
  1461. para_ids = table_metadata["para_ids"]
  1462. para_ids_end = table_metadata["para_ids_end"] # Last paraId in each cell
  1463. header_indices = table_metadata["header_indices"]
  1464. # Skip tables whose every cell is whitespace-only — otherwise an
  1465. # empty `<table>[[""]]</table>` placeholder would leak into block
  1466. # content and a useless IRTable would appear in tables.json.
  1467. if _is_table_empty(table_rows):
  1468. resolver.reset_tracking_state()
  1469. continue
  1470. # Count tables whose cells carry no w14:paraId. Legacy / non-Word
  1471. # docx authors omit these attributes; we no longer fail-fast, but
  1472. # the adapter surfaces a single warning so the user knows the edit
  1473. # range hints will be missing for these tables.
  1474. if parse_warnings is not None and not _table_has_any_paraid(para_ids):
  1475. parse_warnings["missing_paraid_count"] = (
  1476. parse_warnings.get("missing_paraid_count", 0) + 1
  1477. )
  1478. # Convert table to JSON and estimate token count
  1479. table_json = json.dumps(table_rows, ensure_ascii=False)
  1480. table_tokens = estimate_tokens(table_json)
  1481. # Extract cross-page repeating header rows (w:tblHeader) once per
  1482. # table so both split and unsplit branches can surface them to the
  1483. # sidecar via the block-level ``table_headers`` list.
  1484. header_rows = []
  1485. if header_indices:
  1486. header_rows = [
  1487. table_rows[idx] for idx in header_indices if idx < len(table_rows)
  1488. ]
  1489. header_rows_or_none = header_rows if header_rows else None
  1490. # Check if table needs splitting (disabled in fixlevel mode)
  1491. if fixlevel is None and table_tokens > TABLE_MAX_TOKENS:
  1492. # Table exceeds limit - split it
  1493. # Pass table_split_counter to ensure sequential numbering across multiple tables
  1494. table_chunks = split_table_with_heading(
  1495. table_rows,
  1496. para_ids,
  1497. para_ids_end,
  1498. header_indices,
  1499. current_heading,
  1500. table_split_counter,
  1501. debug,
  1502. )
  1503. for chunk_idx, chunk in enumerate(table_chunks):
  1504. chunk_json = json.dumps(chunk["rows"], ensure_ascii=False)
  1505. # Get uuid_end from last valid paraId in chunk (use para_ids_end for last cell's last paragraph)
  1506. chunk_para_id_end = find_last_valid_para_id(chunk["para_ids_end"])
  1507. if chunk["is_first"]:
  1508. # First chunk: add to current_paragraphs (will merge with preceding content)
  1509. current_paragraphs.append(
  1510. {
  1511. "text": f"<table>{chunk_json}</table>",
  1512. "para_id": chunk["uuid"],
  1513. "para_id_end": chunk_para_id_end, # Store end paraId for uuid_end calculation
  1514. "is_table": True,
  1515. "_table_header": header_rows_or_none,
  1516. }
  1517. )
  1518. has_body_content = True
  1519. else:
  1520. # Middle or last chunk: save current block first
  1521. if current_paragraphs:
  1522. _flush_current_block(
  1523. blocks,
  1524. current_heading,
  1525. current_paragraphs,
  1526. current_parent_headings,
  1527. current_heading_level,
  1528. fixlevel,
  1529. debug,
  1530. )
  1531. current_paragraphs = []
  1532. has_body_content = False
  1533. # Generate heading using suffix_number from chunk
  1534. if chunk["suffix_number"] is not None:
  1535. chunk_heading = f"{current_heading} [{TABLE_CHUNK_SUFFIX_LABEL}{chunk['suffix_number']}]"
  1536. else:
  1537. chunk_heading = current_heading
  1538. # Build block for this table chunk
  1539. # Get uuid_end from last valid paraId in chunk (use para_ids_end for last cell's last paragraph)
  1540. chunk_uuid_end = find_last_valid_para_id(chunk["para_ids_end"])
  1541. # Determine table_chunk_role based on chunk position
  1542. if chunk["is_first"] and chunk["is_last"]:
  1543. table_chunk_role = "none" # Not split
  1544. elif chunk["is_first"]:
  1545. table_chunk_role = "first"
  1546. elif chunk["is_last"]:
  1547. table_chunk_role = "last"
  1548. else:
  1549. table_chunk_role = "middle"
  1550. chunk_block = {
  1551. "uuid": chunk["uuid"],
  1552. "uuid_end": chunk_uuid_end,
  1553. "heading": chunk_heading,
  1554. "content": f"<table>{chunk_json}</table>",
  1555. "type": "text",
  1556. "parent_headings": current_parent_headings,
  1557. "level": current_heading_level,
  1558. "table_chunk_role": table_chunk_role,
  1559. }
  1560. # Always emit a per-table headers list (aligned with the
  1561. # single <table> placeholder in this standalone block);
  1562. # the entry is None when the table has no cross-page
  1563. # repeating header so downstream counters stay aligned.
  1564. chunk_block["table_headers"] = [header_rows_or_none]
  1565. if chunk["is_last"]:
  1566. # Last chunk: add to current_paragraphs for merging with following content
  1567. current_paragraphs.append(
  1568. {
  1569. "text": f"<table>{chunk_json}</table>",
  1570. "para_id": chunk["uuid"],
  1571. "para_id_end": chunk_para_id_end, # Store end paraId for uuid_end calculation
  1572. "is_table": True,
  1573. "_chunk_heading": chunk_heading,
  1574. "_table_header": header_rows_or_none,
  1575. }
  1576. )
  1577. has_body_content = True
  1578. else:
  1579. # Middle chunk: output immediately as standalone block
  1580. blocks.append(chunk_block)
  1581. # Update table_split_counter: add number of non-first chunks
  1582. # (first chunk doesn't get a suffix, so we count from second chunk onwards)
  1583. table_split_counter += len(table_chunks) - 1
  1584. else:
  1585. # Table is within size limit - no splitting needed
  1586. # Store table as a paragraph with special marker
  1587. # Use first valid paraId from table, and last valid paraId (from para_ids_end) for uuid_end
  1588. table_para_id = find_first_valid_para_id(para_ids)
  1589. table_para_id_end = find_last_valid_para_id(para_ids_end)
  1590. current_paragraphs.append(
  1591. {
  1592. "text": f"<table>{table_json}</table>",
  1593. "para_id": table_para_id,
  1594. "para_id_end": table_para_id_end, # Store end paraId for uuid_end calculation
  1595. "is_table": True,
  1596. "_table_header": header_rows_or_none,
  1597. }
  1598. )
  1599. # Mark that we have body content
  1600. has_body_content = True
  1601. # Reset numbering tracking after table (table end boundary)
  1602. resolver.reset_tracking_state()
  1603. # Save final block (respecting fixlevel mode)
  1604. _flush_current_block(
  1605. blocks,
  1606. current_heading,
  1607. current_paragraphs,
  1608. current_parent_headings,
  1609. current_heading_level,
  1610. fixlevel,
  1611. debug,
  1612. )
  1613. # Add table_chunk_role="none" to all blocks that don't have it (non-table or unsplit table blocks)
  1614. for block in blocks:
  1615. if "table_chunk_role" not in block:
  1616. block["table_chunk_role"] = "none"
  1617. # Perform small block merging (unified merging after all splits)
  1618. # Disabled in fixlevel mode
  1619. if fixlevel is None:
  1620. if debug:
  1621. print(f"\n[DEBUG] Before merging: {len(blocks)} blocks", file=sys.stderr)
  1622. merged_blocks, merge_count = merge_small_blocks(blocks, debug)
  1623. if debug and merge_count > 0:
  1624. print(
  1625. f"[DEBUG] After merging: {len(merged_blocks)} blocks ({merge_count} merges performed)",
  1626. file=sys.stderr,
  1627. )
  1628. return merged_blocks
  1629. # Fixed level mode: skip merging, but warn if no heading matched the requested level
  1630. if fixlevel > 0 and not matched_fixlevel_heading:
  1631. print(
  1632. f"Warning: --fixlevel={fixlevel} produced {len(blocks)} block(s). "
  1633. f"Document may not have heading levels <= {fixlevel}. "
  1634. f"Try a higher --fixlevel value or remove the flag.",
  1635. file=sys.stderr,
  1636. )
  1637. return blocks