| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423 |
- #!/usr/bin/env python3
- """
- ABOUTME: Resolves automatic numbering labels from DOCX documents
- ABOUTME: Parses numbering.xml and computes rendered number strings
- """
- import zipfile
- from defusedxml import ElementTree as ET
- from typing import Dict
NSMAP = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}


class NumberingResolver:
    """
    Resolves paragraph numbering to rendered label strings.

    DOCX stores numbering definitions in numbering.xml:
    - abstractNum: Defines format templates (lvlText like "%1.%2.")
    - num: Links numId to abstractNumId (optionally with lvlOverride/startOverride)

    Each paragraph references: numId (which definition) + ilvl (which level).
    The resolver is stateful: call get_label() once per paragraph, in document
    order, so that the per-numId counters advance correctly.
    """

    # Clark-notation prefix ("{namespace-uri}") for WordprocessingML names
    _W = "{" + NSMAP["w"] + "}"

    # Number format converters: numFmt value -> callable(int) -> str.
    # Unknown formats fall back to plain decimal in _format_label().
    FORMAT_CONVERTERS = {
        "decimal": lambda n: str(n),
        "lowerLetter": lambda n: NumberingResolver._to_letters(n, "a"),
        "upperLetter": lambda n: NumberingResolver._to_letters(n, "A"),
        "lowerRoman": lambda n: NumberingResolver._to_roman(n).lower(),
        "upperRoman": lambda n: NumberingResolver._to_roman(n),
        "chineseCountingThousand": lambda n: NumberingResolver._to_chinese(n),
        "ideographTraditional": lambda n: "甲乙丙丁戊己庚辛壬癸"[(n - 1) % 10],
        "bullet": lambda n: "•",
        "none": lambda n: "",
    }

    def __init__(self, docx_path: str):
        """
        Load numbering and style definitions from a DOCX archive.

        Parsing is best-effort: a missing or malformed numbering.xml /
        styles.xml leaves the corresponding tables empty (or partially
        filled) instead of raising.

        Args:
            docx_path: Path (or anything zipfile.ZipFile accepts) of the DOCX
        """
        self.abstract_nums: Dict[str, dict] = {}  # abstractNumId -> level definitions
        self.num_to_abstract: Dict[str, str] = {}  # numId -> abstractNumId
        self.counters: Dict[
            str, Dict[int, int]
        ] = {}  # numId -> {ilvl -> current_count}
        self.start_overrides: Dict[
            str, Dict[int, int]
        ] = {}  # numId -> {ilvl -> start_value}
        self.style_numpr: Dict[
            str, dict
        ] = {}  # styleId -> {numId, ilvl} from styles.xml
        self.style_based_on: Dict[str, str] = {}  # styleId -> basedOn styleId

        # Smart numbering merge state (Word's rendering behavior).
        # All three are None until a numbered paragraph has been rendered.
        self.last_numId: str = None  # Previous paragraph's numId
        self.last_abstract_id: str = None  # Previous paragraph's abstractNumId
        self.last_style_id: str = None  # Previous paragraph's style ID

        self._parse_numbering_xml(docx_path)
        self._parse_styles_xml(docx_path)

    # ------------------------------------------------------------------
    # Parsing helpers
    # ------------------------------------------------------------------
    @classmethod
    def _attr(cls, element, name: str):
        """Return the w:-namespaced attribute `name` of `element` (or None)."""
        return element.get(cls._W + name)

    @staticmethod
    def _load_part(docx_path: str, part_name: str):
        """
        Parse one XML part of the DOCX archive.

        Returns:
            The root element, or None when the part is not in the archive.
        """
        with zipfile.ZipFile(docx_path, "r") as zf:
            if part_name not in zf.namelist():
                return None
            # Close the member handle explicitly instead of leaving it to GC.
            with zf.open(part_name) as handle:
                return ET.parse(handle).getroot()

    def _parse_level(self, lvl) -> dict:
        """Build the level-definition dict for a single <w:lvl> element."""
        start_elem = lvl.find("w:start", NSMAP)
        start = 1 if start_elem is None else int(self._attr(start_elem, "val"))

        num_fmt_elem = lvl.find("w:numFmt", NSMAP)
        num_fmt = (
            "decimal" if num_fmt_elem is None else self._attr(num_fmt_elem, "val")
        )

        lvl_text_elem = lvl.find("w:lvlText", NSMAP)
        lvl_text = "%1." if lvl_text_elem is None else self._attr(lvl_text_elem, "val")

        # <w:isLgl/> is an on/off element: present without a value means "on".
        is_lgl = False
        is_lgl_elem = lvl.find("w:isLgl", NSMAP)
        if is_lgl_elem is not None:
            flag = self._attr(is_lgl_elem, "val")
            is_lgl = flag is None or flag not in ("0", "false")

        return {
            "start": start,
            "numFmt": num_fmt,
            "lvlText": lvl_text,
            "isLgl": is_lgl,
        }

    def _parse_numbering_xml(self, docx_path: str):
        """Parse numbering.xml from DOCX archive"""
        try:
            root = self._load_part(docx_path, "word/numbering.xml")
            if root is None:
                return

            # Parse abstractNum definitions
            for abstract in root.findall(".//w:abstractNum", NSMAP):
                abstract_id = self._attr(abstract, "abstractNumId")
                levels = {}
                for lvl in abstract.findall("w:lvl", NSMAP):
                    ilvl = int(self._attr(lvl, "ilvl"))
                    levels[ilvl] = self._parse_level(lvl)
                self.abstract_nums[abstract_id] = levels

            # Parse num -> abstractNum mapping and startOverride
            for num in root.findall(".//w:num", NSMAP):
                num_id = self._attr(num, "numId")
                abstract_ref = num.find("w:abstractNumId", NSMAP)
                if abstract_ref is not None:
                    self.num_to_abstract[num_id] = self._attr(abstract_ref, "val")

                # Parse lvlOverride/startOverride for this num
                for lvl_override in num.findall("w:lvlOverride", NSMAP):
                    ilvl = int(self._attr(lvl_override, "ilvl"))
                    start_override = lvl_override.find("w:startOverride", NSMAP)
                    if start_override is not None:
                        start_val = int(self._attr(start_override, "val"))
                        self.start_overrides.setdefault(num_id, {})[ilvl] = start_val
        except Exception:
            # Deliberately best-effort: the document may not have numbering,
            # and whatever was parsed before the failure is kept.
            pass

    def _parse_styles_xml(self, docx_path: str):
        """Parse styles.xml to get style-inherited numbering definitions"""
        try:
            root = self._load_part(docx_path, "word/styles.xml")
            if root is None:
                return

            for style in root.findall(".//w:style", NSMAP):
                style_id = self._attr(style, "styleId")
                if not style_id:
                    continue

                # Record basedOn (style inheritance)
                based_on = style.find("w:basedOn", NSMAP)
                if based_on is not None:
                    parent_id = self._attr(based_on, "val")
                    if parent_id:
                        self.style_based_on[style_id] = parent_id

                # Record numPr declared in the style's pPr
                pPr = style.find("w:pPr", NSMAP)
                if pPr is None:
                    continue
                numPr = pPr.find("w:numPr", NSMAP)
                if numPr is None:
                    continue
                num_id_elem = numPr.find("w:numId", NSMAP)
                if num_id_elem is None:
                    continue
                ilvl_elem = numPr.find("w:ilvl", NSMAP)
                self.style_numpr[style_id] = {
                    "numId": self._attr(num_id_elem, "val"),
                    "ilvl": 0 if ilvl_elem is None else int(self._attr(ilvl_elem, "val")),
                }
        except Exception:
            # Deliberately best-effort; keep whatever was parsed so far.
            pass

    # ------------------------------------------------------------------
    # Style lookup
    # ------------------------------------------------------------------
    def _get_numbering_from_style(self, style_id: str, visited=None) -> dict:
        """
        Get numbering definition from style, following inheritance chain.

        Args:
            style_id: Style ID to look up
            visited: Set of visited style IDs (to prevent circular references)

        Returns:
            dict with 'numId' and 'ilvl', or None when neither the style nor
            any of its ancestors declares numbering
        """
        seen = set() if visited is None else visited
        current = style_id
        # Walk up the basedOn chain; stop on a cycle.
        while current not in seen:
            seen.add(current)
            if current in self.style_numpr:
                return self.style_numpr[current]
            if current not in self.style_based_on:
                return None
            current = self.style_based_on[current]
        return None

    # ------------------------------------------------------------------
    # Label computation
    # ------------------------------------------------------------------
    def reset_tracking_state(self):
        """
        Reset numbering tracking state.

        Call this when encountering structural breaks that should
        interrupt numbering continuity:
        - Section breaks (sectPr)
        - Table boundaries (before and after tables)

        This prevents incorrect numbering continuation across
        document structure boundaries. Counters themselves are kept.
        """
        self.last_numId = None
        self.last_abstract_id = None
        self.last_style_id = None

    def _resolve_numpr(self, pPr):
        """
        Determine (numId, ilvl, styleId) for a paragraph's <w:pPr>.

        A direct numPr wins; otherwise the numbering declared by the
        paragraph style (or one of its ancestors) is used. numId is None
        when the paragraph is not numbered.
        """
        style_id = None
        pStyle = pPr.find(f"{self._W}pStyle")
        if pStyle is not None:
            style_id = self._attr(pStyle, "val")

        num_id = None
        ilvl = 0
        numPr = pPr.find(f"{self._W}numPr")
        if numPr is not None:
            num_id_elem = numPr.find(f"{self._W}numId")
            if num_id_elem is not None:
                num_id = self._attr(num_id_elem, "val")
                ilvl_elem = numPr.find(f"{self._W}ilvl")
                if ilvl_elem is not None:
                    ilvl = int(self._attr(ilvl_elem, "val"))

        # Direct numPr is a paragraph-local override in Word; it is never
        # stored back onto the style, so paragraphs carrying only pStyle keep
        # following the numPr declared in styles.xml.
        if num_id is None and style_id:
            inherited = self._get_numbering_from_style(style_id)
            if inherited:
                num_id = inherited["numId"]
                ilvl = inherited["ilvl"]

        return num_id, ilvl, style_id

    def _start_value(self, num_id: str, ilvl: int, levels: dict) -> int:
        """Initial counter for a level: the num's startOverride, else the abstractNum start."""
        overrides = self.start_overrides.get(num_id, {})
        if ilvl in overrides:
            return overrides[ilvl]
        return levels[ilvl]["start"]

    def get_label(self, para_element) -> str:
        """
        Get rendered numbering label for a paragraph.

        Checks both direct numPr and style-inherited numbering. Direct numPr
        is a paragraph-local override and applies only to the current
        paragraph; subsequent paragraphs that carry only pStyle fall back to
        the style's numPr declared in styles.xml.

        Calling this method advances the counter of the paragraph's numId.

        Args:
            para_element: Element for <w:p> (ElementTree-compatible API)

        Returns:
            Rendered label string (e.g., "1.1", "a)", "第一章") or empty string
        """
        try:
            pPr = para_element.find(f"{self._W}pPr")
            if pPr is None:
                return ""

            num_id, ilvl, style_id = self._resolve_numpr(pPr)
            if num_id is None:
                # Unnumbered paragraph. The tracking state is intentionally
                # left untouched here; callers use reset_tracking_state() at
                # structural breaks.
                return ""

            abstract_id = self.num_to_abstract.get(num_id)
            levels = None
            if abstract_id is not None:
                levels = self.abstract_nums.get(abstract_id)
            if levels is None or ilvl not in levels:
                # Unknown numId/abstractNum or undefined level: nothing to
                # render, and the next paragraph must not merge with it.
                self.last_numId = None
                self.last_abstract_id = None
                return ""

            # Smart numbering merge (Word's rendering behavior):
            # when consecutive paragraphs have different numId but the same
            # abstractNumId, Word continues the sequence instead of restarting.
            previous = self.last_numId
            if (
                previous is not None
                and previous != num_id
                and self.last_abstract_id == abstract_id
                and previous in self.counters
            ):
                inherited = dict(self.counters[previous])
                if num_id not in self.counters:
                    # First use of this numId: a startOverride is how Word
                    # encodes "restart numbering", so overridden levels must
                    # not inherit the previous list's count.
                    for overridden in self.start_overrides.get(num_id, ()):
                        inherited.pop(overridden, None)
                self.counters[num_id] = inherited

            counters = self.counters.setdefault(num_id, {})

            # Initialize all parent levels if not present (deep nested numbering)
            for parent in range(ilvl):
                if parent in levels and parent not in counters:
                    counters[parent] = self._start_value(num_id, parent, levels)

            # Deeper levels restart once a shallower level is rendered
            for deeper in [lvl for lvl in counters if lvl > ilvl]:
                del counters[deeper]

            # Start or advance the current level
            if ilvl in counters:
                counters[ilvl] += 1
            else:
                counters[ilvl] = self._start_value(num_id, ilvl, levels)

            label = self._format_label(num_id, ilvl, levels)

            # Update tracking state for next paragraph
            self.last_numId = num_id
            self.last_abstract_id = abstract_id
            self.last_style_id = style_id
            return label
        except Exception:
            # Return empty on any error to avoid breaking document parsing
            return ""

    def _format_label(self, num_id: str, ilvl: int, levels: dict) -> str:
        """
        Format label string by replacing %1, %2, etc.

        Each placeholder %N is replaced by the current counter of level N-1,
        rendered with that level's numFmt. When the current level is flagged
        isLgl, all ancestor levels are rendered as decimal.
        Returns an empty string on any error.
        """
        try:
            spec = levels[ilvl]
            text = spec["lvlText"]
            legal = spec.get("isLgl", False)
            counts = self.counters.get(num_id, {})

            for lvl in range(ilvl + 1):
                if lvl not in levels or lvl not in counts:
                    continue
                num_fmt = levels[lvl]["numFmt"]
                if legal and lvl < ilvl:
                    num_fmt = "decimal"
                render = self.FORMAT_CONVERTERS.get(num_fmt, str)
                text = text.replace(f"%{lvl + 1}", render(counts[lvl]))
            return text
        except Exception:
            return ""

    # ------------------------------------------------------------------
    # Number format conversions
    # ------------------------------------------------------------------
    @staticmethod
    def _to_letters(n: int, first: str) -> str:
        """
        Convert integer to an alphabetic list label.

        The sequence is a..z, aa, bb, ..., zz, aaa, ... (the letter is
        repeated, not carried like a base-26 number). `first` is "a" or "A".
        """
        index = n - 1
        # max(1, ...) keeps a single letter for n <= 0, matching the wrap-around
        # the modulo alone would produce.
        repeat = max(1, index // 26 + 1)
        return chr(ord(first) + index % 26) * repeat

    @staticmethod
    def _to_roman(n: int) -> str:
        """Convert integer to Roman numeral (decimal string outside 1..3999)"""
        if n <= 0 or n >= 4000:
            return str(n)

        table = (
            (1000, "M"),
            (900, "CM"),
            (500, "D"),
            (400, "CD"),
            (100, "C"),
            (90, "XC"),
            (50, "L"),
            (40, "XL"),
            (10, "X"),
            (9, "IX"),
            (5, "V"),
            (4, "IV"),
            (1, "I"),
        )
        pieces = []
        for value, numeral in table:
            count, n = divmod(n, value)
            pieces.append(numeral * count)
        return "".join(pieces)

    @staticmethod
    def _to_chinese(n: int) -> str:
        """Convert integer to Chinese numeral (decimal string outside 1..9999)"""
        digits = "零一二三四五六七八九"
        units = ("", "十", "百", "千")

        if n <= 0 or n > 9999:
            return str(n)

        text = str(n)
        width = len(text)
        parts = []
        pending_zero = False
        for position, char in enumerate(text):
            digit = int(char)
            if digit == 0:
                # A run of interior zeros is written as a single 零;
                # trailing zeros are not written at all.
                pending_zero = True
                continue
            if pending_zero:
                parts.append("零")
                pending_zero = False
            parts.append(digits[digit] + units[width - position - 1])

        result = "".join(parts)
        if 10 <= n < 20:
            # 10-19 are written 十, 十一, ... without the leading 一
            result = result[1:]
        return result
|