table_extractor.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. #!/usr/bin/env python3
  2. """
  3. ABOUTME: Extracts tables from DOCX with proper merged cell handling
  4. ABOUTME: Vertically merged cells: content repeated in all rows with shared paraId
  5. ABOUTME: Horizontally merged cells: content in first cell only
  6. ABOUTME: Preserves superscript/subscript formatting with <sup>/<sub> markup
  7. """
  8. from docx.table import Table
  9. from docx.oxml.ns import qn
  10. from typing import List
  11. from .drawing_image_extractor import (
  12. DrawingExtractionContext,
  13. extract_drawing_placeholder_from_element,
  14. extract_vml_image_placeholder_from_element,
  15. )
  16. # Keep in sync with parse_document._SKIP_PARAGRAPH_TAGS — duplicated here to
  17. # avoid a circular import between parse_document and table_extractor.
  18. _SKIP_PARAGRAPH_TAGS = frozenset(
  19. {
  20. "del",
  21. "moveFrom",
  22. "commentRangeStart",
  23. "commentRangeEnd",
  24. "commentReference",
  25. "annotationRef",
  26. }
  27. )
  28. def extract_text_from_run_table(
  29. run_elem,
  30. qn_func,
  31. drawing_context: DrawingExtractionContext = None,
  32. ) -> str:
  33. """
  34. Extract text from a run element in table cell, preserving superscript/subscript with markup.
  35. Converts Word formatting to HTML-like tags:
  36. - Superscript: <sup>text</sup>
  37. - Subscript: <sub>text</sub>
  38. - Normal text: unchanged
  39. Args:
  40. run_elem: lxml run element (w:r)
  41. qn_func: qn function for namespace handling
  42. Returns:
  43. Text string with <sup>/<sub> markup for formatted portions
  44. """
  45. text = ""
  46. # Check for vertAlign in rPr (superscript/subscript)
  47. vert_align = None
  48. rPr = run_elem.find(qn_func("w:rPr"))
  49. if rPr is not None:
  50. vert_elem = rPr.find(qn_func("w:vertAlign"))
  51. if vert_elem is not None:
  52. vert_align = vert_elem.get(qn_func("w:val"))
  53. # Extract text content from run children
  54. for child in run_elem:
  55. tag = child.tag.split("}")[-1] # Remove namespace
  56. if tag == "t" and child.text:
  57. text += child.text
  58. elif tag == "tab":
  59. text += "\t"
  60. elif tag == "br":
  61. # Handle line breaks - textWrapping or no type = soft line break
  62. br_type = child.get(qn_func("w:type"))
  63. if br_type in (None, "textWrapping"):
  64. text += "\n"
  65. # Skip page and column breaks (layout elements)
  66. elif tag == "drawing":
  67. text += extract_drawing_placeholder_from_element(
  68. child,
  69. context=drawing_context,
  70. include_extended_attrs=True,
  71. )
  72. elif tag in ("pict", "object"):
  73. text += extract_vml_image_placeholder_from_element(
  74. child,
  75. context=drawing_context,
  76. include_extended_attrs=True,
  77. )
  78. # Apply superscript/subscript markup if needed
  79. if text and vert_align == "superscript":
  80. return f"<sup>{text}</sup>"
  81. elif text and vert_align == "subscript":
  82. return f"<sub>{text}</sub>"
  83. return text
  84. def extract_paragraph_content_table(
  85. para_elem,
  86. qn_func,
  87. drawing_context: DrawingExtractionContext = None,
  88. ) -> str:
  89. """
  90. Extract text and equations from a table cell paragraph in document order.
  91. Handles w:r (text runs), m:oMath (inline equations), and m:oMathPara
  92. (block equations). Recurses into container elements (e.g., w:hyperlink,
  93. w:ins, w:sdt, w:fldSimple, w:smartTag) to avoid dropping content.
  94. Args:
  95. para_elem: lxml paragraph element (w:p)
  96. qn_func: qn function for namespace handling
  97. Returns:
  98. Text string with equations wrapped in <equation> tags
  99. """
  100. parts = []
  101. def append_from(node) -> None:
  102. tag = node.tag.split("}")[-1]
  103. # Drop tracked-change deletions (w:del/w:moveFrom) and comment markers
  104. # (w:commentRangeStart/End, w:commentReference, w:annotationRef) so the
  105. # output only contains the final revised text without annotation glyphs.
  106. if tag in _SKIP_PARAGRAPH_TAGS:
  107. return
  108. if tag == "r":
  109. parts.append(
  110. extract_text_from_run_table(
  111. node,
  112. qn_func,
  113. drawing_context=drawing_context,
  114. )
  115. )
  116. return
  117. if tag == "oMath":
  118. from omml import convert_omml_to_latex
  119. latex = convert_omml_to_latex(node)
  120. if latex:
  121. parts.append(f"<equation>{latex}</equation>")
  122. return
  123. if tag == "oMathPara":
  124. from omml import convert_omml_to_latex
  125. for omath in node:
  126. if omath.tag.split("}")[-1] == "oMath":
  127. latex = convert_omml_to_latex(omath)
  128. if latex:
  129. parts.append(f"<equation>{latex}</equation>")
  130. return
  131. for child in node:
  132. append_from(child)
  133. for child in para_elem:
  134. append_from(child)
  135. return "".join(parts)
  136. class TableExtractor:
  137. """
  138. Extract table content handling merged cells correctly.
  139. Merged cells in DOCX:
  140. - Horizontal: w:gridSpan specifies how many columns cell spans
  141. - Vertical: w:vMerge with val="restart" starts merge, subsequent cells continue
  142. Output format:
  143. - 2D list of strings
  144. - Vertically merged cells: content repeated in all rows, all rows use the same paraId (from start cell)
  145. - Horizontally merged cells: content in left-most position only, other positions empty
  146. """
  147. @staticmethod
  148. def extract(
  149. table: Table,
  150. numbering_resolver=None,
  151. drawing_context: DrawingExtractionContext = None,
  152. ) -> List[List[str]]:
  153. """
  154. Extract table to 2D string array.
  155. Args:
  156. table: python-docx Table object
  157. numbering_resolver: Optional NumberingResolver for extracting numbering
  158. Returns:
  159. List of rows, each row is list of cell text strings
  160. """
  161. result = TableExtractor.extract_with_metadata(
  162. table,
  163. numbering_resolver=numbering_resolver,
  164. drawing_context=drawing_context,
  165. )
  166. return result["rows"]
  167. @staticmethod
  168. def extract_with_metadata(
  169. table: Table,
  170. numbering_resolver=None,
  171. drawing_context: DrawingExtractionContext = None,
  172. ) -> dict:
  173. """
  174. Extract table to 2D string array with metadata (paraIds, header info).
  175. Vertical merge behavior:
  176. - All rows in a vertically merged region share the same content
  177. - All rows use the paraId from the merge start cell (for precise edit targeting)
  178. Args:
  179. table: python-docx Table object
  180. numbering_resolver: Optional NumberingResolver for extracting numbering
  181. Returns:
  182. Dict with:
  183. - rows: 2D list of cell text strings
  184. - para_ids: 2D list of paraIds (first paraId in each cell, or None)
  185. For vertically merged cells, all rows share the start cell's paraId
  186. - para_ids_end: 2D list of paraIds (last paraId in each cell, or None)
  187. For vertically merged cells, all rows share the start cell's paraId
  188. - header_indices: List of row indices marked as table headers
  189. """
  190. tbl = table._tbl
  191. # Get number of columns from tblGrid
  192. tbl_grid = tbl.find(qn("w:tblGrid"))
  193. num_cols = 0
  194. if tbl_grid is not None:
  195. num_cols = len(tbl_grid.findall(qn("w:gridCol")))
  196. if num_cols == 0:
  197. return {
  198. "rows": [],
  199. "para_ids": [],
  200. "para_ids_end": [],
  201. "header_indices": [],
  202. }
  203. # Detect header rows using w:tblHeader attribute
  204. header_indices = []
  205. for idx, tr in enumerate(tbl.findall(qn("w:tr"))):
  206. trPr = tr.find(qn("w:trPr"))
  207. if trPr is not None:
  208. tbl_header = trPr.find(qn("w:tblHeader"))
  209. if tbl_header is not None:
  210. header_indices.append(idx)
  211. # Process each row by directly iterating <w:tr> elements
  212. grid = []
  213. para_ids_grid = []
  214. para_ids_end_grid = [] # Track last paraId in each cell
  215. vmerge_content = {} # Track vertical merge by column: {col: {'text': str, 'para_id': str, 'para_id_end': str}}
  216. for tr in tbl.findall(qn("w:tr")):
  217. row_data = [""] * num_cols # Pre-fill with empty strings
  218. row_para_ids = [None] * num_cols # Pre-fill with None
  219. row_para_ids_end = [None] * num_cols # Pre-fill with None for last paraId
  220. grid_col = 0
  221. # Iterate actual <w:tc> elements (each physical cell appears once)
  222. for tc in tr.findall(qn("w:tc")):
  223. # Reset numbering state when cell changes to prevent incorrect continuation
  224. if numbering_resolver is not None:
  225. numbering_resolver.reset_tracking_state()
  226. tcPr = tc.find(qn("w:tcPr"))
  227. # Check gridSpan (horizontal merge)
  228. grid_span = 1
  229. if tcPr is not None:
  230. gs = tcPr.find(qn("w:gridSpan"))
  231. if gs is not None:
  232. grid_span = int(gs.get(qn("w:val")))
  233. # Check vMerge (vertical merge)
  234. vmerge_elem = None
  235. vmerge_val = None
  236. if tcPr is not None:
  237. vmerge_elem = tcPr.find(qn("w:vMerge"))
  238. if vmerge_elem is not None:
  239. vmerge_val = vmerge_elem.get(
  240. qn("w:val")
  241. ) # 'restart' or None (means 'continue')
  242. # Determine vMerge status
  243. is_vmerge_restart = vmerge_elem is not None and vmerge_val == "restart"
  244. is_vmerge_continue = vmerge_elem is not None and vmerge_val in (
  245. None,
  246. "continue",
  247. )
  248. is_normal_cell = vmerge_elem is None
  249. cell_text = ""
  250. cell_para_id = None
  251. cell_para_id_end = None # Track last paraId in cell
  252. # Handle different vMerge cases
  253. if is_vmerge_restart or is_normal_cell:
  254. # Extract content for restart or normal cells
  255. # Get cell text with numbering support and format preservation
  256. if numbering_resolver is not None:
  257. # Extract text with numbering labels and superscript/subscript markup
  258. cell_paragraphs = []
  259. for para_elem in tc.findall(qn("w:p")):
  260. # Capture paraId from each paragraph
  261. para_id_attr = para_elem.get(
  262. "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
  263. )
  264. if para_id_attr:
  265. if cell_para_id is None:
  266. cell_para_id = para_id_attr # First paraId
  267. cell_para_id_end = (
  268. para_id_attr # Always update to get last
  269. )
  270. # Get text content with format preservation (superscript/subscript/equations)
  271. para_text = extract_paragraph_content_table(
  272. para_elem,
  273. qn,
  274. drawing_context=drawing_context,
  275. )
  276. # Get numbering label
  277. label = numbering_resolver.get_label(para_elem)
  278. # Combine label and text
  279. if label:
  280. full_text = f"{label} {para_text}".strip()
  281. else:
  282. full_text = para_text.strip()
  283. if full_text:
  284. cell_paragraphs.append(full_text)
  285. cell_text = "\n".join(cell_paragraphs).replace("\x07", "")
  286. else:
  287. # Fallback to simple text extraction with format preservation
  288. # Cannot use cell.text here, must extract from XML
  289. para_texts = []
  290. for para_elem in tc.findall(qn("w:p")):
  291. # Capture paraId from each paragraph
  292. para_id_attr = para_elem.get(
  293. "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
  294. )
  295. if para_id_attr:
  296. if cell_para_id is None:
  297. cell_para_id = para_id_attr # First paraId
  298. cell_para_id_end = (
  299. para_id_attr # Always update to get last
  300. )
  301. # Extract text with format preservation (superscript/subscript/equations)
  302. para_text = extract_paragraph_content_table(
  303. para_elem,
  304. qn,
  305. drawing_context=drawing_context,
  306. )
  307. if para_text:
  308. para_texts.append(para_text.strip())
  309. cell_text = "\n".join(para_texts).replace("\x07", "")
  310. # Store content and paraIds for vMerge restart
  311. if is_vmerge_restart:
  312. vmerge_content[grid_col] = {
  313. "text": cell_text,
  314. "para_id": cell_para_id,
  315. "para_id_end": cell_para_id_end,
  316. }
  317. elif is_normal_cell:
  318. # For normal cells: if empty and we have active vMerge, copy all from start
  319. # If non-empty, this ends the vMerge region
  320. if not cell_text and grid_col in vmerge_content:
  321. # Empty cell in vMerge region - copy content and paraIds from start
  322. cell_text = vmerge_content[grid_col]["text"]
  323. cell_para_id = vmerge_content[grid_col]["para_id"]
  324. cell_para_id_end = vmerge_content[grid_col]["para_id_end"]
  325. elif cell_text:
  326. # Non-empty cell - this ends the vMerge for this column
  327. vmerge_content.pop(grid_col, None)
  328. elif is_vmerge_continue:
  329. # Copy content and para_id from previous merge start
  330. # But extract actual para_id_end from this continue cell for range boundary
  331. if grid_col in vmerge_content:
  332. cell_text = vmerge_content[grid_col]["text"]
  333. cell_para_id = vmerge_content[grid_col][
  334. "para_id"
  335. ] # Use restart's paraId for edit targeting
  336. # Extract actual paraId from this continue cell for uuid_end (range boundary)
  337. for para_elem in tc.findall(qn("w:p")):
  338. para_id_attr = para_elem.get(
  339. "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
  340. )
  341. if para_id_attr:
  342. cell_para_id_end = (
  343. para_id_attr # Use actual paraId for range boundary
  344. )
  345. # Place content at starting grid position only
  346. if grid_col < num_cols:
  347. row_data[grid_col] = cell_text
  348. row_para_ids[grid_col] = cell_para_id
  349. row_para_ids_end[grid_col] = cell_para_id_end
  350. # Move grid position by gridSpan
  351. grid_col += grid_span
  352. grid.append(row_data)
  353. para_ids_grid.append(row_para_ids)
  354. para_ids_end_grid.append(row_para_ids_end)
  355. return {
  356. "rows": grid,
  357. "para_ids": para_ids_grid,
  358. "para_ids_end": para_ids_end_grid,
  359. "header_indices": header_indices,
  360. }