#!/usr/bin/env python3
"""
ABOUTME: Resolves automatic numbering labels from DOCX documents
ABOUTME: Parses numbering.xml and computes rendered number strings
"""
import logging
import zipfile
from typing import Dict, Optional

from defusedxml import ElementTree as ET
  9. NSMAP = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
  10. class NumberingResolver:
  11. """
  12. Resolves paragraph numbering to rendered label strings.
  13. DOCX stores numbering definitions in numbering.xml:
  14. - abstractNum: Defines format templates (lvlText like "%1.%2.")
  15. - num: Links numId to abstractNumId
  16. Each paragraph references: numId (which definition) + ilvl (which level)
  17. """
  18. # Number format converters
  19. FORMAT_CONVERTERS = {
  20. "decimal": lambda n: str(n),
  21. "lowerLetter": lambda n: chr(ord("a") + (n - 1) % 26),
  22. "upperLetter": lambda n: chr(ord("A") + (n - 1) % 26),
  23. "lowerRoman": lambda n: NumberingResolver._to_roman(n).lower(),
  24. "upperRoman": lambda n: NumberingResolver._to_roman(n),
  25. "chineseCountingThousand": lambda n: NumberingResolver._to_chinese(n),
  26. "ideographTraditional": lambda n: "甲乙丙丁戊己庚辛壬癸"[(n - 1) % 10],
  27. "bullet": lambda n: "•",
  28. "none": lambda n: "",
  29. }
  30. def __init__(self, docx_path: str):
  31. self.abstract_nums: Dict[str, dict] = {} # abstractNumId -> level definitions
  32. self.num_to_abstract: Dict[str, str] = {} # numId -> abstractNumId
  33. self.counters: Dict[
  34. str, Dict[int, int]
  35. ] = {} # numId -> {ilvl -> current_count}
  36. self.start_overrides: Dict[
  37. str, Dict[int, int]
  38. ] = {} # numId -> {ilvl -> start_value}
  39. self.style_numpr: Dict[
  40. str, dict
  41. ] = {} # styleId -> {numId, ilvl} from styles.xml
  42. self.style_based_on: Dict[str, str] = {} # styleId -> basedOn styleId
  43. # Smart numbering merge state (Word's rendering behavior)
  44. self.last_numId: str = None # Previous paragraph's numId
  45. self.last_abstract_id: str = None # Previous paragraph's abstractNumId
  46. self.last_style_id: str = None # Previous paragraph's style ID
  47. self._parse_numbering_xml(docx_path)
  48. self._parse_styles_xml(docx_path)
  49. def _parse_numbering_xml(self, docx_path: str):
  50. """Parse numbering.xml from DOCX archive"""
  51. try:
  52. with zipfile.ZipFile(docx_path, "r") as zf:
  53. if "word/numbering.xml" not in zf.namelist():
  54. return
  55. tree = ET.parse(zf.open("word/numbering.xml"))
  56. root = tree.getroot()
  57. # Parse abstractNum definitions
  58. for abstract in root.findall(".//w:abstractNum", NSMAP):
  59. abstract_id = abstract.get(f'{{{NSMAP["w"]}}}abstractNumId')
  60. levels = {}
  61. for lvl in abstract.findall("w:lvl", NSMAP):
  62. ilvl = int(lvl.get(f'{{{NSMAP["w"]}}}ilvl'))
  63. start_elem = lvl.find("w:start", NSMAP)
  64. start = (
  65. int(start_elem.get(f'{{{NSMAP["w"]}}}val'))
  66. if start_elem is not None
  67. else 1
  68. )
  69. num_fmt_elem = lvl.find("w:numFmt", NSMAP)
  70. num_fmt = (
  71. num_fmt_elem.get(f'{{{NSMAP["w"]}}}val')
  72. if num_fmt_elem is not None
  73. else "decimal"
  74. )
  75. lvl_text_elem = lvl.find("w:lvlText", NSMAP)
  76. lvl_text = (
  77. lvl_text_elem.get(f'{{{NSMAP["w"]}}}val')
  78. if lvl_text_elem is not None
  79. else "%1."
  80. )
  81. is_lgl_elem = lvl.find("w:isLgl", NSMAP)
  82. is_lgl = False
  83. if is_lgl_elem is not None:
  84. val = is_lgl_elem.get(f'{{{NSMAP["w"]}}}val')
  85. is_lgl = val is None or val not in ("0", "false")
  86. levels[ilvl] = {
  87. "start": start,
  88. "numFmt": num_fmt,
  89. "lvlText": lvl_text,
  90. "isLgl": is_lgl,
  91. }
  92. self.abstract_nums[abstract_id] = levels
  93. # Parse num -> abstractNum mapping and startOverride
  94. for num in root.findall(".//w:num", NSMAP):
  95. num_id = num.get(f'{{{NSMAP["w"]}}}numId')
  96. abstract_ref = num.find("w:abstractNumId", NSMAP)
  97. if abstract_ref is not None:
  98. self.num_to_abstract[num_id] = abstract_ref.get(
  99. f'{{{NSMAP["w"]}}}val'
  100. )
  101. # Parse lvlOverride/startOverride for this num
  102. for lvl_override in num.findall("w:lvlOverride", NSMAP):
  103. ilvl = int(lvl_override.get(f'{{{NSMAP["w"]}}}ilvl'))
  104. start_override = lvl_override.find("w:startOverride", NSMAP)
  105. if start_override is not None:
  106. start_val = int(start_override.get(f'{{{NSMAP["w"]}}}val'))
  107. if num_id not in self.start_overrides:
  108. self.start_overrides[num_id] = {}
  109. self.start_overrides[num_id][ilvl] = start_val
  110. except Exception:
  111. # Silently ignore parsing errors - document may not have numbering
  112. pass
  113. def _parse_styles_xml(self, docx_path: str):
  114. """Parse styles.xml to get style-inherited numbering definitions"""
  115. try:
  116. with zipfile.ZipFile(docx_path, "r") as zf:
  117. if "word/styles.xml" not in zf.namelist():
  118. return
  119. tree = ET.parse(zf.open("word/styles.xml"))
  120. root = tree.getroot()
  121. # Parse style definitions
  122. for style in root.findall(".//w:style", NSMAP):
  123. style_id = style.get(f'{{{NSMAP["w"]}}}styleId')
  124. if not style_id:
  125. continue
  126. # Check for basedOn (style inheritance)
  127. based_on = style.find("w:basedOn", NSMAP)
  128. if based_on is not None:
  129. parent_id = based_on.get(f'{{{NSMAP["w"]}}}val')
  130. if parent_id:
  131. self.style_based_on[style_id] = parent_id
  132. # Check for numPr in style's pPr
  133. pPr = style.find("w:pPr", NSMAP)
  134. if pPr is not None:
  135. numPr = pPr.find("w:numPr", NSMAP)
  136. if numPr is not None:
  137. num_id_elem = numPr.find("w:numId", NSMAP)
  138. ilvl_elem = numPr.find("w:ilvl", NSMAP)
  139. if num_id_elem is not None:
  140. num_id = num_id_elem.get(f'{{{NSMAP["w"]}}}val')
  141. ilvl = (
  142. int(ilvl_elem.get(f'{{{NSMAP["w"]}}}val'))
  143. if ilvl_elem is not None
  144. else 0
  145. )
  146. self.style_numpr[style_id] = {
  147. "numId": num_id,
  148. "ilvl": ilvl,
  149. }
  150. except Exception:
  151. # Silently ignore parsing errors
  152. pass
  153. def _get_numbering_from_style(self, style_id: str, visited=None) -> dict:
  154. """
  155. Get numbering definition from style, following inheritance chain.
  156. Args:
  157. style_id: Style ID to look up
  158. visited: Set of visited style IDs (to prevent circular references)
  159. Returns:
  160. dict with 'numId' and 'ilvl', or None
  161. """
  162. if visited is None:
  163. visited = set()
  164. # Prevent circular references
  165. if style_id in visited:
  166. return None
  167. visited.add(style_id)
  168. # Check if this style has numPr
  169. if style_id in self.style_numpr:
  170. return self.style_numpr[style_id]
  171. # Check parent style
  172. if style_id in self.style_based_on:
  173. parent_id = self.style_based_on[style_id]
  174. return self._get_numbering_from_style(parent_id, visited)
  175. return None
  176. def reset_tracking_state(self):
  177. """
  178. Reset numbering tracking state.
  179. Call this when encountering structural breaks that should
  180. interrupt numbering continuity:
  181. - Section breaks (sectPr)
  182. - Table boundaries (before and after tables)
  183. This prevents incorrect numbering continuation across
  184. document structure boundaries.
  185. """
  186. self.last_numId = None
  187. self.last_abstract_id = None
  188. self.last_style_id = None
  189. def get_label(self, para_element) -> str:
  190. """
  191. Get rendered numbering label for a paragraph.
  192. Checks both direct numPr and style-inherited numbering. Direct numPr
  193. is a paragraph-local override and applies only to the current
  194. paragraph; subsequent paragraphs that carry only pStyle fall back to
  195. the style's numPr declared in styles.xml.
  196. Args:
  197. para_element: lxml Element for <w:p>
  198. Returns:
  199. Rendered label string (e.g., "1.1", "a)", "第一章") or empty string
  200. """
  201. try:
  202. pPr = para_element.find(f'{{{NSMAP["w"]}}}pPr')
  203. if pPr is None:
  204. return ""
  205. num_id = None
  206. ilvl = 0
  207. style_id = None
  208. # Get pStyle (if present)
  209. pStyle = pPr.find(f'{{{NSMAP["w"]}}}pStyle')
  210. if pStyle is not None:
  211. style_id = pStyle.get(f'{{{NSMAP["w"]}}}val')
  212. # Check for direct numPr in paragraph
  213. numPr = pPr.find(f'{{{NSMAP["w"]}}}numPr')
  214. if numPr is not None:
  215. num_id_elem = numPr.find(f'{{{NSMAP["w"]}}}numId')
  216. ilvl_elem = numPr.find(f'{{{NSMAP["w"]}}}ilvl')
  217. if num_id_elem is not None:
  218. num_id = num_id_elem.get(f'{{{NSMAP["w"]}}}val')
  219. ilvl = (
  220. int(ilvl_elem.get(f'{{{NSMAP["w"]}}}val'))
  221. if ilvl_elem is not None
  222. else 0
  223. )
  224. # If no direct numPr, fall back to style-inherited numbering.
  225. # Direct numPr is a paragraph-local override in Word; it must not
  226. # persist as a runtime default for the style, otherwise subsequent
  227. # paragraphs that only carry pStyle will keep following the local
  228. # override instead of the style's declared numPr.
  229. if num_id is None and style_id:
  230. style_num = self._get_numbering_from_style(style_id)
  231. if style_num:
  232. num_id = style_num["numId"]
  233. ilvl = style_num["ilvl"]
  234. # If still no numbering found, clear state and return empty
  235. if num_id is None:
  236. # We should use list structure breaking logic to reset last_numId, last_abstract_id and last_style_id
  237. return ""
  238. # Get abstract definition
  239. abstract_id = self.num_to_abstract.get(num_id)
  240. if abstract_id is None or abstract_id not in self.abstract_nums:
  241. # Clear state for invalid numbering
  242. self.last_numId = None
  243. self.last_abstract_id = None
  244. return ""
  245. levels = self.abstract_nums[abstract_id]
  246. if ilvl not in levels:
  247. # Clear state for invalid level
  248. self.last_numId = None
  249. self.last_abstract_id = None
  250. return ""
  251. # Smart numbering merge: (Word's rendering behavior)
  252. # When consecutive paragraphs have different numId but same abstractNumId,
  253. # Word continues the numbering sequence rather than restarting.
  254. # This happens regardless of whether the numId is new or style matches.
  255. if (
  256. self.last_numId is not None
  257. and self.last_numId != num_id
  258. and self.last_abstract_id == abstract_id
  259. and self.last_numId in self.counters
  260. ):
  261. # Merge: copy previous numId's counter to current numId
  262. self.counters[num_id] = self.counters[self.last_numId].copy()
  263. # Initialize/update counter
  264. if num_id not in self.counters:
  265. self.counters[num_id] = {}
  266. # Initialize all parent levels if not present (for deep nested numbering)
  267. for i in range(ilvl):
  268. if i not in self.counters[num_id] and i in levels:
  269. # Use startOverride if exists, otherwise use abstractNum's start value
  270. if (
  271. num_id in self.start_overrides
  272. and i in self.start_overrides[num_id]
  273. ):
  274. self.counters[num_id][i] = self.start_overrides[num_id][i]
  275. else:
  276. self.counters[num_id][i] = levels[i]["start"]
  277. # Reset lower levels when higher level increments
  278. for i in range(ilvl + 1, 10):
  279. if i in self.counters[num_id]:
  280. del self.counters[num_id][i]
  281. # Initialize current level if needed
  282. if ilvl not in self.counters[num_id]:
  283. # Use startOverride if exists, otherwise use abstractNum's start value
  284. if (
  285. num_id in self.start_overrides
  286. and ilvl in self.start_overrides[num_id]
  287. ):
  288. self.counters[num_id][ilvl] = self.start_overrides[num_id][ilvl]
  289. else:
  290. self.counters[num_id][ilvl] = levels[ilvl]["start"]
  291. else:
  292. self.counters[num_id][ilvl] += 1
  293. # Format the label using lvlText template
  294. label = self._format_label(num_id, ilvl, levels)
  295. # Update tracking state for next paragraph
  296. self.last_numId = num_id
  297. self.last_abstract_id = abstract_id
  298. self.last_style_id = style_id
  299. return label
  300. except Exception:
  301. # Return empty on any error to avoid breaking document parsing
  302. return ""
  303. def _format_label(self, num_id: str, ilvl: int, levels: dict) -> str:
  304. """Format label string by replacing %1, %2, etc."""
  305. try:
  306. lvl_text = levels[ilvl]["lvlText"]
  307. result = lvl_text
  308. current_is_lgl = levels[ilvl].get("isLgl", False)
  309. for i in range(ilvl + 1):
  310. if i in levels and i in self.counters.get(num_id, {}):
  311. num_fmt = levels[i]["numFmt"]
  312. if current_is_lgl and i < ilvl:
  313. num_fmt = "decimal"
  314. count = self.counters[num_id][i]
  315. converter = self.FORMAT_CONVERTERS.get(num_fmt, lambda n: str(n))
  316. formatted = converter(count)
  317. result = result.replace(f"%{i+1}", formatted)
  318. return result
  319. except Exception:
  320. return ""
  321. @staticmethod
  322. def _to_roman(n: int) -> str:
  323. """Convert integer to Roman numeral"""
  324. if n <= 0 or n >= 4000:
  325. return str(n)
  326. values = [
  327. (1000, "M"),
  328. (900, "CM"),
  329. (500, "D"),
  330. (400, "CD"),
  331. (100, "C"),
  332. (90, "XC"),
  333. (50, "L"),
  334. (40, "XL"),
  335. (10, "X"),
  336. (9, "IX"),
  337. (5, "V"),
  338. (4, "IV"),
  339. (1, "I"),
  340. ]
  341. result = ""
  342. for value, numeral in values:
  343. while n >= value:
  344. result += numeral
  345. n -= value
  346. return result
  347. @staticmethod
  348. def _to_chinese(n: int) -> str:
  349. """Convert integer to Chinese numeral"""
  350. digits = "零一二三四五六七八九"
  351. if n <= 0 or n > 99:
  352. return str(n)
  353. if n < 10:
  354. return digits[n]
  355. if n < 20:
  356. return "十" + (digits[n % 10] if n % 10 else "")
  357. if n < 100:
  358. tens = n // 10
  359. ones = n % 10
  360. return digits[tens] + "十" + (digits[ones] if ones else "")
  361. return str(n)