| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419 |
- #!/usr/bin/env python3
- """
- ABOUTME: Extracts tables from DOCX with proper merged cell handling
- ABOUTME: Vertically merged cells: content repeated in all rows with shared paraId
- ABOUTME: Horizontally merged cells: content in first cell only
- ABOUTME: Preserves superscript/subscript formatting with <sup>/<sub> markup
- """
- from docx.table import Table
- from docx.oxml.ns import qn
- from typing import List
- from .drawing_image_extractor import (
- DrawingExtractionContext,
- extract_drawing_placeholder_from_element,
- extract_vml_image_placeholder_from_element,
- )
- # Keep in sync with parse_document._SKIP_PARAGRAPH_TAGS — duplicated here to
- # avoid a circular import between parse_document and table_extractor.
- _SKIP_PARAGRAPH_TAGS = frozenset(
- {
- "del",
- "moveFrom",
- "commentRangeStart",
- "commentRangeEnd",
- "commentReference",
- "annotationRef",
- }
- )
- def extract_text_from_run_table(
- run_elem,
- qn_func,
- drawing_context: DrawingExtractionContext = None,
- ) -> str:
- """
- Extract text from a run element in table cell, preserving superscript/subscript with markup.
- Converts Word formatting to HTML-like tags:
- - Superscript: <sup>text</sup>
- - Subscript: <sub>text</sub>
- - Normal text: unchanged
- Args:
- run_elem: lxml run element (w:r)
- qn_func: qn function for namespace handling
- Returns:
- Text string with <sup>/<sub> markup for formatted portions
- """
- text = ""
- # Check for vertAlign in rPr (superscript/subscript)
- vert_align = None
- rPr = run_elem.find(qn_func("w:rPr"))
- if rPr is not None:
- vert_elem = rPr.find(qn_func("w:vertAlign"))
- if vert_elem is not None:
- vert_align = vert_elem.get(qn_func("w:val"))
- # Extract text content from run children
- for child in run_elem:
- tag = child.tag.split("}")[-1] # Remove namespace
- if tag == "t" and child.text:
- text += child.text
- elif tag == "tab":
- text += "\t"
- elif tag == "br":
- # Handle line breaks - textWrapping or no type = soft line break
- br_type = child.get(qn_func("w:type"))
- if br_type in (None, "textWrapping"):
- text += "\n"
- # Skip page and column breaks (layout elements)
- elif tag == "drawing":
- text += extract_drawing_placeholder_from_element(
- child,
- context=drawing_context,
- include_extended_attrs=True,
- )
- elif tag in ("pict", "object"):
- text += extract_vml_image_placeholder_from_element(
- child,
- context=drawing_context,
- include_extended_attrs=True,
- )
- # Apply superscript/subscript markup if needed
- if text and vert_align == "superscript":
- return f"<sup>{text}</sup>"
- elif text and vert_align == "subscript":
- return f"<sub>{text}</sub>"
- return text
- def extract_paragraph_content_table(
- para_elem,
- qn_func,
- drawing_context: DrawingExtractionContext = None,
- ) -> str:
- """
- Extract text and equations from a table cell paragraph in document order.
- Handles w:r (text runs), m:oMath (inline equations), and m:oMathPara
- (block equations). Recurses into container elements (e.g., w:hyperlink,
- w:ins, w:sdt, w:fldSimple, w:smartTag) to avoid dropping content.
- Args:
- para_elem: lxml paragraph element (w:p)
- qn_func: qn function for namespace handling
- Returns:
- Text string with equations wrapped in <equation> tags
- """
- parts = []
- def append_from(node) -> None:
- tag = node.tag.split("}")[-1]
- # Drop tracked-change deletions (w:del/w:moveFrom) and comment markers
- # (w:commentRangeStart/End, w:commentReference, w:annotationRef) so the
- # output only contains the final revised text without annotation glyphs.
- if tag in _SKIP_PARAGRAPH_TAGS:
- return
- if tag == "r":
- parts.append(
- extract_text_from_run_table(
- node,
- qn_func,
- drawing_context=drawing_context,
- )
- )
- return
- if tag == "oMath":
- from omml import convert_omml_to_latex
- latex = convert_omml_to_latex(node)
- if latex:
- parts.append(f"<equation>{latex}</equation>")
- return
- if tag == "oMathPara":
- from omml import convert_omml_to_latex
- for omath in node:
- if omath.tag.split("}")[-1] == "oMath":
- latex = convert_omml_to_latex(omath)
- if latex:
- parts.append(f"<equation>{latex}</equation>")
- return
- for child in node:
- append_from(child)
- for child in para_elem:
- append_from(child)
- return "".join(parts)
- class TableExtractor:
- """
- Extract table content handling merged cells correctly.
- Merged cells in DOCX:
- - Horizontal: w:gridSpan specifies how many columns cell spans
- - Vertical: w:vMerge with val="restart" starts merge, subsequent cells continue
- Output format:
- - 2D list of strings
- - Vertically merged cells: content repeated in all rows, all rows use the same paraId (from start cell)
- - Horizontally merged cells: content in left-most position only, other positions empty
- """
- @staticmethod
- def extract(
- table: Table,
- numbering_resolver=None,
- drawing_context: DrawingExtractionContext = None,
- ) -> List[List[str]]:
- """
- Extract table to 2D string array.
- Args:
- table: python-docx Table object
- numbering_resolver: Optional NumberingResolver for extracting numbering
- Returns:
- List of rows, each row is list of cell text strings
- """
- result = TableExtractor.extract_with_metadata(
- table,
- numbering_resolver=numbering_resolver,
- drawing_context=drawing_context,
- )
- return result["rows"]
- @staticmethod
- def extract_with_metadata(
- table: Table,
- numbering_resolver=None,
- drawing_context: DrawingExtractionContext = None,
- ) -> dict:
- """
- Extract table to 2D string array with metadata (paraIds, header info).
- Vertical merge behavior:
- - All rows in a vertically merged region share the same content
- - All rows use the paraId from the merge start cell (for precise edit targeting)
- Args:
- table: python-docx Table object
- numbering_resolver: Optional NumberingResolver for extracting numbering
- Returns:
- Dict with:
- - rows: 2D list of cell text strings
- - para_ids: 2D list of paraIds (first paraId in each cell, or None)
- For vertically merged cells, all rows share the start cell's paraId
- - para_ids_end: 2D list of paraIds (last paraId in each cell, or None)
- For vertically merged cells, all rows share the start cell's paraId
- - header_indices: List of row indices marked as table headers
- """
- tbl = table._tbl
- # Get number of columns from tblGrid
- tbl_grid = tbl.find(qn("w:tblGrid"))
- num_cols = 0
- if tbl_grid is not None:
- num_cols = len(tbl_grid.findall(qn("w:gridCol")))
- if num_cols == 0:
- return {
- "rows": [],
- "para_ids": [],
- "para_ids_end": [],
- "header_indices": [],
- }
- # Detect header rows using w:tblHeader attribute
- header_indices = []
- for idx, tr in enumerate(tbl.findall(qn("w:tr"))):
- trPr = tr.find(qn("w:trPr"))
- if trPr is not None:
- tbl_header = trPr.find(qn("w:tblHeader"))
- if tbl_header is not None:
- header_indices.append(idx)
- # Process each row by directly iterating <w:tr> elements
- grid = []
- para_ids_grid = []
- para_ids_end_grid = [] # Track last paraId in each cell
- vmerge_content = {} # Track vertical merge by column: {col: {'text': str, 'para_id': str, 'para_id_end': str}}
- for tr in tbl.findall(qn("w:tr")):
- row_data = [""] * num_cols # Pre-fill with empty strings
- row_para_ids = [None] * num_cols # Pre-fill with None
- row_para_ids_end = [None] * num_cols # Pre-fill with None for last paraId
- grid_col = 0
- # Iterate actual <w:tc> elements (each physical cell appears once)
- for tc in tr.findall(qn("w:tc")):
- # Reset numbering state when cell changes to prevent incorrect continuation
- if numbering_resolver is not None:
- numbering_resolver.reset_tracking_state()
- tcPr = tc.find(qn("w:tcPr"))
- # Check gridSpan (horizontal merge)
- grid_span = 1
- if tcPr is not None:
- gs = tcPr.find(qn("w:gridSpan"))
- if gs is not None:
- grid_span = int(gs.get(qn("w:val")))
- # Check vMerge (vertical merge)
- vmerge_elem = None
- vmerge_val = None
- if tcPr is not None:
- vmerge_elem = tcPr.find(qn("w:vMerge"))
- if vmerge_elem is not None:
- vmerge_val = vmerge_elem.get(
- qn("w:val")
- ) # 'restart' or None (means 'continue')
- # Determine vMerge status
- is_vmerge_restart = vmerge_elem is not None and vmerge_val == "restart"
- is_vmerge_continue = vmerge_elem is not None and vmerge_val in (
- None,
- "continue",
- )
- is_normal_cell = vmerge_elem is None
- cell_text = ""
- cell_para_id = None
- cell_para_id_end = None # Track last paraId in cell
- # Handle different vMerge cases
- if is_vmerge_restart or is_normal_cell:
- # Extract content for restart or normal cells
- # Get cell text with numbering support and format preservation
- if numbering_resolver is not None:
- # Extract text with numbering labels and superscript/subscript markup
- cell_paragraphs = []
- for para_elem in tc.findall(qn("w:p")):
- # Capture paraId from each paragraph
- para_id_attr = para_elem.get(
- "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
- )
- if para_id_attr:
- if cell_para_id is None:
- cell_para_id = para_id_attr # First paraId
- cell_para_id_end = (
- para_id_attr # Always update to get last
- )
- # Get text content with format preservation (superscript/subscript/equations)
- para_text = extract_paragraph_content_table(
- para_elem,
- qn,
- drawing_context=drawing_context,
- )
- # Get numbering label
- label = numbering_resolver.get_label(para_elem)
- # Combine label and text
- if label:
- full_text = f"{label} {para_text}".strip()
- else:
- full_text = para_text.strip()
- if full_text:
- cell_paragraphs.append(full_text)
- cell_text = "\n".join(cell_paragraphs).replace("\x07", "")
- else:
- # Fallback to simple text extraction with format preservation
- # Cannot use cell.text here, must extract from XML
- para_texts = []
- for para_elem in tc.findall(qn("w:p")):
- # Capture paraId from each paragraph
- para_id_attr = para_elem.get(
- "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
- )
- if para_id_attr:
- if cell_para_id is None:
- cell_para_id = para_id_attr # First paraId
- cell_para_id_end = (
- para_id_attr # Always update to get last
- )
- # Extract text with format preservation (superscript/subscript/equations)
- para_text = extract_paragraph_content_table(
- para_elem,
- qn,
- drawing_context=drawing_context,
- )
- if para_text:
- para_texts.append(para_text.strip())
- cell_text = "\n".join(para_texts).replace("\x07", "")
- # Store content and paraIds for vMerge restart
- if is_vmerge_restart:
- vmerge_content[grid_col] = {
- "text": cell_text,
- "para_id": cell_para_id,
- "para_id_end": cell_para_id_end,
- }
- elif is_normal_cell:
- # For normal cells: if empty and we have active vMerge, copy all from start
- # If non-empty, this ends the vMerge region
- if not cell_text and grid_col in vmerge_content:
- # Empty cell in vMerge region - copy content and paraIds from start
- cell_text = vmerge_content[grid_col]["text"]
- cell_para_id = vmerge_content[grid_col]["para_id"]
- cell_para_id_end = vmerge_content[grid_col]["para_id_end"]
- elif cell_text:
- # Non-empty cell - this ends the vMerge for this column
- vmerge_content.pop(grid_col, None)
- elif is_vmerge_continue:
- # Copy content and para_id from previous merge start
- # But extract actual para_id_end from this continue cell for range boundary
- if grid_col in vmerge_content:
- cell_text = vmerge_content[grid_col]["text"]
- cell_para_id = vmerge_content[grid_col][
- "para_id"
- ] # Use restart's paraId for edit targeting
- # Extract actual paraId from this continue cell for uuid_end (range boundary)
- for para_elem in tc.findall(qn("w:p")):
- para_id_attr = para_elem.get(
- "{http://schemas.microsoft.com/office/word/2010/wordml}paraId"
- )
- if para_id_attr:
- cell_para_id_end = (
- para_id_attr # Use actual paraId for range boundary
- )
- # Place content at starting grid position only
- if grid_col < num_cols:
- row_data[grid_col] = cell_text
- row_para_ids[grid_col] = cell_para_id
- row_para_ids_end[grid_col] = cell_para_id_end
- # Move grid position by gridSpan
- grid_col += grid_span
- grid.append(row_data)
- para_ids_grid.append(row_para_ids)
- para_ids_end_grid.append(row_para_ids_end)
- return {
- "rows": grid,
- "para_ids": para_ids_grid,
- "para_ids_end": para_ids_end_grid,
- "header_indices": header_indices,
- }
|