drawing_image_extractor.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. #!/usr/bin/env python3
  2. """
  3. ABOUTME: Shared drawing/image extraction utilities for DOCX parsing and editing
  4. ABOUTME: Resolves w:drawing -> a:blip relationships, exports embedded images, builds placeholders
  5. """
  6. from __future__ import annotations
  7. import posixpath
  8. import re
  9. import shutil
  10. import zipfile
  11. from dataclasses import dataclass, field
  12. from html import escape, unescape
  13. from pathlib import Path, PurePosixPath
  14. from typing import Dict, Optional, Tuple
  15. from urllib.parse import urlparse
  16. try:
  17. from defusedxml import ElementTree as ET
  18. except ImportError: # pragma: no cover
  19. from xml.etree import ElementTree as ET
# XML namespace prefix -> URI map passed to ElementTree find/findall calls
# (e.g. ".//a:blip", ".//wp:docPr", ".//v:imagedata") and used to build
# fully-qualified r:* attribute names.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "v": "urn:schemas-microsoft-com:vml",
}
# Namespace of the <Relationship> elements read from word/_rels/document.xml.rels.
REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
# Namespace of the <Override>/<Default> elements read from [Content_Types].xml.
CONTENT_TYPE_NS = "http://schemas.openxmlformats.org/package/2006/content-types"
# Relationship Type value that a relationship must carry to be exported as an image.
IMAGE_REL_TYPE = (
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
)
# Part name that relative relationship targets are resolved against.
SOURCE_DOCUMENT_PART = "/word/document.xml"
# Match old and new drawing placeholders (requires id/name, allows extra attributes)
DRAWING_PATTERN = re.compile(
    r'<drawing\b(?=[^>]*\bid="[^"]*")(?=[^>]*\bname="[^"]*")[^>]*/>'
)
# Any self-closing <drawing ... /> tag, whatever attributes it carries.
DRAWING_TAG_PATTERN = re.compile(r"<drawing\b[^>]*/>")
# One name="value" attribute pair: group 1 is the attribute name, group 2 the
# raw (still HTML-escaped) value between the double quotes.
DRAWING_ATTR_PATTERN = re.compile(r'([a-zA-Z_][\w:.-]*)="([^"]*)"')
  39. @dataclass
  40. class DrawingRelationship:
  41. """Relationship metadata for a single relationship ID."""
  42. rel_id: str
  43. target: str
  44. target_mode: str
  45. rel_type: str
  46. part_name: Optional[str] = None
  47. content_type: Optional[str] = None
  48. image_format: Optional[str] = None
  49. @dataclass
  50. class DrawingExtractionContext:
  51. """Context used to resolve and export drawing images for one DOCX file."""
  52. docx_path: Path
  53. blocks_output_path: Optional[Path] = None
  54. export_dir_name: Optional[str] = None
  55. export_dir_path: Optional[Path] = None
  56. relationships: Dict[str, DrawingRelationship] = field(default_factory=dict)
  57. _exported_part_to_relpath: Dict[str, str] = field(default_factory=dict)
  58. _used_filenames: Dict[str, str] = field(default_factory=dict)
  59. def resolve_relationship(self, rel_id: str) -> Optional[DrawingRelationship]:
  60. return self.relationships.get(rel_id)
  61. def export_embedded_image(self, rel: DrawingRelationship) -> Optional[str]:
  62. """
  63. Export an embedded image relationship target to export_dir.
  64. Returns:
  65. Relative path like "<blocks_stem>.image/image1.png" if exported,
  66. or None when export is not applicable.
  67. """
  68. if not self.export_dir_path or not self.export_dir_name:
  69. return None
  70. if rel.target_mode.lower() == "external":
  71. return None
  72. if not rel.part_name:
  73. return None
  74. if rel.part_name in self._exported_part_to_relpath:
  75. return self._exported_part_to_relpath[rel.part_name]
  76. zip_member = rel.part_name.lstrip("/")
  77. try:
  78. with zipfile.ZipFile(self.docx_path, "r") as zf:
  79. blob = zf.read(zip_member)
  80. except Exception:
  81. return None
  82. filename = self._dedupe_filename(PurePosixPath(rel.part_name).name or "image")
  83. output_file = self.export_dir_path / filename
  84. output_file.write_bytes(blob)
  85. rel_path = str(PurePosixPath(self.export_dir_name) / filename)
  86. self._exported_part_to_relpath[rel.part_name] = rel_path
  87. return rel_path
  88. def _dedupe_filename(self, base_name: str) -> str:
  89. if base_name not in self._used_filenames:
  90. self._used_filenames[base_name] = base_name
  91. return base_name
  92. stem = Path(base_name).stem
  93. suffix = Path(base_name).suffix
  94. index = 2
  95. while True:
  96. candidate = f"{stem}_{index}{suffix}"
  97. if candidate not in self._used_filenames:
  98. self._used_filenames[candidate] = candidate
  99. return candidate
  100. index += 1
  101. def _normalize_image_format(ext_or_type: str) -> Optional[str]:
  102. if not ext_or_type:
  103. return None
  104. value = ext_or_type.strip().lower()
  105. # Content-Type
  106. if value.startswith("image/"):
  107. value = value.split("/", 1)[1]
  108. if "+" in value:
  109. value = value.split("+", 1)[0]
  110. if value.startswith("x-"):
  111. value = value[2:]
  112. # Extension (with or without leading dot)
  113. value = value.lstrip(".")
  114. if value == "jpg":
  115. return "jpeg"
  116. if value in {"jpeg", "png", "gif", "bmp", "tiff", "webp", "svg", "emf", "wmf"}:
  117. return value
  118. return value or None
  119. def _infer_format_from_target(target: str) -> Optional[str]:
  120. if not target:
  121. return None
  122. parsed = urlparse(target)
  123. path = parsed.path if parsed.scheme else target
  124. suffix = PurePosixPath(path).suffix
  125. return _normalize_image_format(suffix)
  126. def _resolve_part_name(source_part_name: str, target: str) -> str:
  127. if target.startswith("/"):
  128. return posixpath.normpath(target)
  129. source_dir = posixpath.dirname(source_part_name)
  130. joined = posixpath.join(source_dir, target)
  131. normalized = posixpath.normpath(joined)
  132. if not normalized.startswith("/"):
  133. normalized = "/" + normalized
  134. return normalized
  135. def create_drawing_context(
  136. docx_path: str,
  137. blocks_output_path: Optional[str] = None,
  138. ) -> DrawingExtractionContext:
  139. """
  140. Create extraction context for a DOCX file.
  141. If blocks_output_path is provided, this also prepares `<blocks_stem>.image/`
  142. beside the blocks file and clears any previous content.
  143. """
  144. docx_file = Path(docx_path)
  145. ctx = DrawingExtractionContext(docx_path=docx_file)
  146. if blocks_output_path:
  147. output_path = Path(blocks_output_path)
  148. export_dir_name = f"{output_path.stem}.image"
  149. export_dir_path = output_path.parent / export_dir_name
  150. if export_dir_path.exists():
  151. shutil.rmtree(export_dir_path)
  152. export_dir_path.mkdir(parents=True, exist_ok=True)
  153. ctx.blocks_output_path = output_path
  154. ctx.export_dir_name = export_dir_name
  155. ctx.export_dir_path = export_dir_path
  156. load_relationships(ctx)
  157. return ctx
  158. def load_relationships(ctx: DrawingExtractionContext) -> None:
  159. rels_xml = "word/_rels/document.xml.rels"
  160. content_types_xml = "[Content_Types].xml"
  161. overrides: Dict[str, str] = {}
  162. defaults: Dict[str, str] = {}
  163. try:
  164. with zipfile.ZipFile(ctx.docx_path, "r") as zf:
  165. if content_types_xml in zf.namelist():
  166. ct_root = ET.parse(zf.open(content_types_xml)).getroot()
  167. for node in ct_root.findall(f".//{{{CONTENT_TYPE_NS}}}Override"):
  168. part_name = node.get("PartName")
  169. content_type = node.get("ContentType")
  170. if part_name and content_type:
  171. overrides[part_name] = content_type
  172. for node in ct_root.findall(f".//{{{CONTENT_TYPE_NS}}}Default"):
  173. ext = node.get("Extension")
  174. content_type = node.get("ContentType")
  175. if ext and content_type:
  176. defaults[ext.lower()] = content_type
  177. if rels_xml not in zf.namelist():
  178. return
  179. rels_root = ET.parse(zf.open(rels_xml)).getroot()
  180. except Exception:
  181. return
  182. for rel in rels_root.findall(f".//{{{REL_NS}}}Relationship"):
  183. rel_id = rel.get("Id")
  184. target = rel.get("Target", "")
  185. target_mode = rel.get("TargetMode", "")
  186. rel_type = rel.get("Type", "")
  187. if not rel_id:
  188. continue
  189. part_name = None
  190. content_type = None
  191. image_format = None
  192. if target_mode.lower() != "external":
  193. part_name = _resolve_part_name(SOURCE_DOCUMENT_PART, target)
  194. if part_name:
  195. content_type = overrides.get(part_name)
  196. if not content_type:
  197. ext = PurePosixPath(part_name).suffix.lower().lstrip(".")
  198. content_type = defaults.get(ext)
  199. image_format = _normalize_image_format(
  200. content_type or _infer_format_from_target(part_name)
  201. )
  202. else:
  203. image_format = _normalize_image_format(_infer_format_from_target(target))
  204. ctx.relationships[rel_id] = DrawingRelationship(
  205. rel_id=rel_id,
  206. target=target,
  207. target_mode=target_mode,
  208. rel_type=rel_type,
  209. part_name=part_name,
  210. content_type=content_type,
  211. image_format=image_format,
  212. )
  213. def _extract_blip_relationship(drawing_elem) -> Optional[Tuple[str, str]]:
  214. for blip in drawing_elem.findall(".//a:blip", NS):
  215. # Prefer explicit external links when both link/embed are present on one blip.
  216. # Word may keep an embedded cache for linked pictures.
  217. rel_link = blip.get(f"{{{NS['r']}}}link")
  218. if rel_link:
  219. return "link", rel_link
  220. rel_embed = blip.get(f"{{{NS['r']}}}embed")
  221. if rel_embed:
  222. return "embed", rel_embed
  223. return None
  224. def _extract_imagedata_relationship(container_elem) -> Optional[str]:
  225. """Find an image relationship id from a w:pict / w:object via v:imagedata.
  226. These legacy VML containers are how Word references EMF/WMF metafiles
  227. (and the rendered preview of any embedded OLE object). v:imagedata uses
  228. ``r:id`` to point at the image part for both embedded and externally
  229. linked images — the relationship's ``TargetMode`` is what disambiguates
  230. the two cases, so the caller must inspect the resolved relationship.
  231. """
  232. r_id_attr = f"{{{NS['r']}}}id"
  233. for imgdata in container_elem.findall(".//v:imagedata", NS):
  234. rel_id = imgdata.get(r_id_attr)
  235. if rel_id:
  236. return rel_id
  237. return None
  238. def _build_placeholder(attrs: Dict[str, str]) -> str:
  239. ordered_keys = ["id", "name", "path", "format"]
  240. pieces = []
  241. for key in ordered_keys:
  242. if key in attrs and attrs[key] is not None:
  243. pieces.append(f'{key}="{escape(str(attrs[key]), quote=True)}"')
  244. # Preserve extra attributes deterministically (sorted by name)
  245. for key in sorted(k for k in attrs.keys() if k not in ordered_keys):
  246. value = attrs[key]
  247. if value is not None:
  248. pieces.append(f'{key}="{escape(str(value), quote=True)}"')
  249. return f"<drawing {' '.join(pieces)} />"
  250. def extract_drawing_placeholder_from_element(
  251. drawing_elem,
  252. context: Optional[DrawingExtractionContext] = None,
  253. include_extended_attrs: bool = True,
  254. ) -> str:
  255. """
  256. Build a <drawing ... /> placeholder from a w:drawing element.
  257. Behavior:
  258. - Always emits id/name from wp:docPr when present.
  259. - For embedded images (a:blip@r:embed): exports image and sets path/format.
  260. - For linked images (a:blip@r:link): does not download; path is original link target.
  261. - When no image reference exists (e.g. chart drawing): keeps id/name only.
  262. """
  263. doc_pr = drawing_elem.find(".//wp:docPr", NS)
  264. attrs = {
  265. "id": doc_pr.get("id", "") if doc_pr is not None else "",
  266. "name": doc_pr.get("name", "") if doc_pr is not None else "",
  267. }
  268. if include_extended_attrs:
  269. rel_ref = _extract_blip_relationship(drawing_elem)
  270. if rel_ref is not None and context is not None:
  271. rel_kind, rel_id = rel_ref
  272. rel = context.resolve_relationship(rel_id)
  273. if rel is not None:
  274. if rel_kind == "embed" and rel.rel_type == IMAGE_REL_TYPE:
  275. rel_path = context.export_embedded_image(rel)
  276. if rel_path:
  277. attrs["path"] = rel_path
  278. if rel.image_format:
  279. attrs["format"] = rel.image_format
  280. elif rel_kind == "link":
  281. if rel.target:
  282. attrs["path"] = rel.target
  283. if rel.image_format:
  284. attrs["format"] = rel.image_format
  285. return _build_placeholder(attrs)
  286. def extract_vml_image_placeholder_from_element(
  287. container_elem,
  288. context: Optional[DrawingExtractionContext] = None,
  289. include_extended_attrs: bool = True,
  290. ) -> str:
  291. """
  292. Build a <drawing ... /> placeholder from a w:pict or w:object element.
  293. Legacy Word documents and OLE-embedded objects (Visio diagrams, equation
  294. editor previews, etc.) expose their rendered image via VML rather than
  295. DrawingML. The image is referenced through ``<v:imagedata r:id="..."/>``
  296. inside ``<v:shape>``, and the underlying bytes are commonly EMF/WMF
  297. metafiles. This function exports those bytes through the same context as
  298. DrawingML images so EMF/WMF assets land in the blocks.assets directory
  299. alongside PNG/JPEG ones.
  300. The output placeholder format matches
  301. ``extract_drawing_placeholder_from_element`` so downstream consumers
  302. treat both paths uniformly.
  303. """
  304. shape = container_elem.find(".//v:shape", NS)
  305. attrs = {
  306. "id": shape.get("id", "") if shape is not None else "",
  307. "name": shape.get("alt", "") if shape is not None else "",
  308. }
  309. if include_extended_attrs:
  310. rel_id = _extract_imagedata_relationship(container_elem)
  311. if rel_id and context is not None:
  312. rel = context.resolve_relationship(rel_id)
  313. if rel is not None and rel.rel_type == IMAGE_REL_TYPE:
  314. # VML reuses r:id for both embedded image parts and externally
  315. # linked images; only the resolved TargetMode tells us which.
  316. # Treating an external relationship as embedded would call
  317. # export_embedded_image() (which short-circuits on external)
  318. # and silently drop the linked path.
  319. if rel.target_mode.lower() == "external":
  320. if rel.target:
  321. attrs["path"] = rel.target
  322. if rel.image_format:
  323. attrs["format"] = rel.image_format
  324. else:
  325. rel_path = context.export_embedded_image(rel)
  326. if rel_path:
  327. attrs["path"] = rel_path
  328. if rel.image_format:
  329. attrs["format"] = rel.image_format
  330. return _build_placeholder(attrs)
  331. def parse_drawing_attributes(placeholder: str) -> Dict[str, str]:
  332. """Parse attributes from a <drawing ... /> placeholder."""
  333. return {
  334. name: unescape(value)
  335. for name, value in DRAWING_ATTR_PATTERN.findall(placeholder)
  336. }
  337. def normalize_drawing_placeholder(
  338. placeholder: str,
  339. include_extended_attrs: bool = False,
  340. ) -> str:
  341. """
  342. Normalize one drawing placeholder into canonical attribute order.
  343. Args:
  344. placeholder: Input placeholder string
  345. include_extended_attrs: If False, keeps only id/name.
  346. """
  347. attrs = parse_drawing_attributes(placeholder)
  348. normalized = {
  349. "id": attrs.get("id", ""),
  350. "name": attrs.get("name", ""),
  351. }
  352. if include_extended_attrs:
  353. if "path" in attrs:
  354. normalized["path"] = attrs["path"]
  355. if "format" in attrs:
  356. normalized["format"] = attrs["format"]
  357. for key, value in attrs.items():
  358. if key not in {"id", "name", "path", "format"}:
  359. normalized[key] = value
  360. return _build_placeholder(normalized)
  361. def normalize_drawing_placeholders_in_text(
  362. text: str,
  363. include_extended_attrs: bool = False,
  364. ) -> str:
  365. """Normalize all drawing placeholders inside a text blob."""
  366. if not text:
  367. return text
  368. def _replace(match: re.Match) -> str:
  369. return normalize_drawing_placeholder(
  370. match.group(0),
  371. include_extended_attrs=include_extended_attrs,
  372. )
  373. return DRAWING_TAG_PATTERN.sub(_replace, text)