| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445 |
- #!/usr/bin/env python3
- """
- ABOUTME: Shared drawing/image extraction utilities for DOCX parsing and editing
- ABOUTME: Resolves w:drawing -> a:blip relationships, exports embedded images, builds placeholders
- """
- from __future__ import annotations
- import posixpath
- import re
- import shutil
- import zipfile
- from dataclasses import dataclass, field
- from html import escape, unescape
- from pathlib import Path, PurePosixPath
- from typing import Dict, Optional, Tuple
- from urllib.parse import urlparse
- try:
- from defusedxml import ElementTree as ET
- except ImportError: # pragma: no cover
- from xml.etree import ElementTree as ET
# XML namespace prefixes used by the ElementTree path queries in this module
# (e.g. ".//a:blip", ".//wp:docPr", ".//v:imagedata").
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "v": "urn:schemas-microsoft-com:vml",
}
# Namespace of the <Relationship> elements read from word/_rels/document.xml.rels.
REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
# Namespace of the <Override>/<Default> elements read from [Content_Types].xml.
CONTENT_TYPE_NS = "http://schemas.openxmlformats.org/package/2006/content-types"
# Relationship Type value that identifies a relationship as pointing at an image.
IMAGE_REL_TYPE = (
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
)
# Part that relative relationship targets are resolved against.
SOURCE_DOCUMENT_PART = "/word/document.xml"
# Match old and new drawing placeholders (requires id/name, allows extra attributes)
DRAWING_PATTERN = re.compile(
    r'<drawing\b(?=[^>]*\bid="[^"]*")(?=[^>]*\bname="[^"]*")[^>]*/>'
)
# Any self-closing <drawing ... /> tag, whether or not it carries id/name.
DRAWING_TAG_PATTERN = re.compile(r"<drawing\b[^>]*/>")
# One name="value" attribute pair: group 1 is the name, group 2 the raw
# (still HTML-escaped) value.
DRAWING_ATTR_PATTERN = re.compile(r'([a-zA-Z_][\w:.-]*)="([^"]*)"')
@dataclass
class DrawingRelationship:
    """Relationship metadata for a single relationship ID.

    Instances are created by ``load_relationships`` from the entries of
    ``word/_rels/document.xml.rels``.
    """

    # Value of the Relationship element's "Id" attribute.
    rel_id: str
    # Raw "Target" attribute ("" when the attribute is absent).
    target: str
    # Raw "TargetMode" attribute ("" when absent); compared case-insensitively
    # against "external" by the code that consumes it.
    target_mode: str
    # Raw "Type" attribute ("" when absent).
    rel_type: str
    # Absolute part name (leading "/") resolved against /word/document.xml;
    # left as None for external relationships.
    part_name: Optional[str] = None
    # Content type from [Content_Types].xml (Override by part name, else
    # Default by extension); None when neither matches or the target is external.
    content_type: Optional[str] = None
    # Normalized format token derived from the content type or the target's
    # file extension; None when nothing could be derived.
    image_format: Optional[str] = None
@dataclass
class DrawingExtractionContext:
    """Context used to resolve and export drawing images for one DOCX file.

    The context maps relationship ids to their metadata and, when an export
    directory is configured, copies embedded image parts out of the DOCX
    archive into that directory.
    """

    # Path of the DOCX archive that image parts are read from.
    docx_path: Path
    # Blocks file the export directory is derived from (None disables export
    # when the export directory fields are also unset).
    blocks_output_path: Optional[Path] = None
    # Directory name used as the prefix of every returned relative path.
    export_dir_name: Optional[str] = None
    # Filesystem location the image files are written to.
    export_dir_path: Optional[Path] = None
    # Relationship id -> relationship metadata.
    relationships: Dict[str, DrawingRelationship] = field(default_factory=dict)
    # Part name -> relative path already exported, so each part is written once.
    _exported_part_to_relpath: Dict[str, str] = field(default_factory=dict)
    # Casefolded filename -> filename actually handed out by _dedupe_filename.
    _used_filenames: Dict[str, str] = field(default_factory=dict)

    def resolve_relationship(self, rel_id: str) -> Optional[DrawingRelationship]:
        """Return the relationship registered under *rel_id*, or None."""
        return self.relationships.get(rel_id)

    def export_embedded_image(self, rel: DrawingRelationship) -> Optional[str]:
        """
        Export an embedded image relationship target to export_dir.

        Each part is written at most once; repeated calls for the same part
        return the previously computed relative path.

        Returns:
            Relative path like "<blocks_stem>.image/image1.png" if exported,
            or None when export is not applicable (no export directory,
            external relationship, no part name, or unreadable archive member).
        """
        if not self.export_dir_path or not self.export_dir_name:
            return None
        if rel.target_mode.lower() == "external":
            return None
        if not rel.part_name:
            return None

        cached = self._exported_part_to_relpath.get(rel.part_name)
        if cached is not None:
            return cached

        # Part names carry a leading "/", zip member names do not.
        zip_member = rel.part_name.lstrip("/")
        try:
            with zipfile.ZipFile(self.docx_path, "r") as zf:
                blob = zf.read(zip_member)
        except Exception:
            # Best-effort: a missing member or unreadable archive only means
            # this placeholder gets no exported path.
            return None

        filename = self._dedupe_filename(PurePosixPath(rel.part_name).name or "image")
        (self.export_dir_path / filename).write_bytes(blob)

        rel_path = str(PurePosixPath(self.export_dir_name) / filename)
        self._exported_part_to_relpath[rel.part_name] = rel_path
        return rel_path

    def _dedupe_filename(self, base_name: str) -> str:
        """Return a filename that has not been handed out before.

        Names are compared case-insensitively so that two parts such as
        "Image1.png" and "image1.png" never map to the same file on
        case-insensitive filesystems. Collisions get a "_<n>" suffix before
        the extension, starting at 2.
        """
        stem = Path(base_name).stem
        suffix = Path(base_name).suffix
        candidate = base_name
        index = 2
        while candidate.casefold() in self._used_filenames:
            candidate = f"{stem}_{index}{suffix}"
            index += 1
        self._used_filenames[candidate.casefold()] = candidate
        return candidate
def _normalize_image_format(ext_or_type: str) -> Optional[str]:
    """Reduce a content type or file extension to a short format token.

    Args:
        ext_or_type: Either an ``image/...`` content type (e.g.
            ``"image/svg+xml"``, ``"image/x-emf"``) or a file extension with
            or without the leading dot (e.g. ``".JPG"``, ``"tif"``).

    Returns:
        The lower-cased token with common aliases folded onto one spelling
        (``jpg``/``jpe`` -> ``jpeg``, ``tif`` -> ``tiff``), or None when the
        input is empty or reduces to an empty string. Values that are neither
        an ``image/`` content type nor a known alias are returned lower-cased
        with leading dots removed.
    """
    if not ext_or_type:
        return None
    value = ext_or_type.strip().lower()

    # Content-Type: keep the subtype, drop any "+suffix" and "x-" prefix.
    if value.startswith("image/"):
        value = value.split("/", 1)[1]
        value = value.split("+", 1)[0]
        if value.startswith("x-"):
            value = value[2:]

    # Extension (with or without leading dot)
    value = value.lstrip(".")

    # Fold alternative spellings so an extension and its content type agree.
    aliases = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff"}
    return aliases.get(value, value) or None
def _infer_format_from_target(target: str) -> Optional[str]:
    """Guess an image format token from a relationship target's extension.

    Args:
        target: Relationship target; either a URL with a scheme or a plain
            (relative or absolute) path.

    Returns:
        The normalized format token for the target's file extension, or None
        when the target is empty or has no extension.
    """
    if not target:
        return None
    try:
        parsed = urlparse(target)
    except ValueError:
        # Malformed URLs (e.g. an unbalanced "[" in the host) make urlparse
        # raise; targets come from the document, so degrade to treating the
        # raw string as a path instead of propagating the error.
        path = target
    else:
        # Only trust the parsed path when a scheme was recognised; otherwise
        # the target is a plain path and is used as-is.
        path = parsed.path if parsed.scheme else target
    suffix = PurePosixPath(path).suffix
    if not suffix:
        return None
    return _normalize_image_format(suffix)
def _resolve_part_name(source_part_name: str, target: str) -> str:
    """Resolve a relationship target to an absolute, normalized part name.

    Args:
        source_part_name: Part name of the part that owns the relationship
            (e.g. ``"/word/document.xml"``); relative targets are resolved
            against its directory.
        target: Relationship target, absolute (leading ``/``) or relative.

    Returns:
        The normalized part name, always starting with exactly one ``/``.
    """
    if target.startswith("/"):
        combined = target
    else:
        combined = posixpath.join(posixpath.dirname(source_part_name), target)
    # normpath deliberately preserves a leading "//"; part names never use
    # that form, so collapse any run of leading slashes to a single one.
    return "/" + posixpath.normpath(combined).lstrip("/")
def create_drawing_context(
    docx_path: str,
    blocks_output_path: Optional[str] = None,
) -> DrawingExtractionContext:
    """
    Build the extraction context for one DOCX file and load its relationships.

    When blocks_output_path is given, a fresh `<blocks_stem>.image/` directory
    is created next to the blocks file; an existing directory of that name is
    removed first, so earlier exports do not survive.
    """
    ctx = DrawingExtractionContext(docx_path=Path(docx_path))

    if blocks_output_path:
        blocks_file = Path(blocks_output_path)
        image_dir_name = f"{blocks_file.stem}.image"
        image_dir = blocks_file.parent / image_dir_name

        # Start from an empty directory on every run.
        if image_dir.exists():
            shutil.rmtree(image_dir)
        image_dir.mkdir(parents=True, exist_ok=True)

        ctx.blocks_output_path = blocks_file
        ctx.export_dir_name = image_dir_name
        ctx.export_dir_path = image_dir

    load_relationships(ctx)
    return ctx
def load_relationships(ctx: DrawingExtractionContext) -> None:
    """Populate ``ctx.relationships`` from the DOCX archive at ``ctx.docx_path``.

    Reads ``[Content_Types].xml`` (when present) to learn content types and
    ``word/_rels/document.xml.rels`` for the relationships themselves. Each
    relationship with an ``Id`` is stored as a ``DrawingRelationship``; for
    non-external targets the part name, content type and image format are
    resolved as well.

    The function is best-effort: if the archive cannot be opened or parsed, or
    the relationships part is missing, it returns without adding anything.
    """
    rels_xml = "word/_rels/document.xml.rels"
    content_types_xml = "[Content_Types].xml"
    overrides: Dict[str, str] = {}
    defaults: Dict[str, str] = {}
    try:
        with zipfile.ZipFile(ctx.docx_path, "r") as zf:
            members = set(zf.namelist())
            if content_types_xml in members:
                # Close the member handle explicitly instead of leaving it to
                # garbage collection.
                with zf.open(content_types_xml) as handle:
                    ct_root = ET.parse(handle).getroot()
                for node in ct_root.findall(f".//{{{CONTENT_TYPE_NS}}}Override"):
                    part_name = node.get("PartName")
                    content_type = node.get("ContentType")
                    if part_name and content_type:
                        overrides[part_name] = content_type
                for node in ct_root.findall(f".//{{{CONTENT_TYPE_NS}}}Default"):
                    ext = node.get("Extension")
                    content_type = node.get("ContentType")
                    if ext and content_type:
                        defaults[ext.lower()] = content_type
            if rels_xml not in members:
                return
            with zf.open(rels_xml) as handle:
                rels_root = ET.parse(handle).getroot()
    except Exception:
        # Best-effort: an unreadable archive simply yields no relationships.
        return

    for rel in rels_root.findall(f".//{{{REL_NS}}}Relationship"):
        rel_id = rel.get("Id")
        target = rel.get("Target", "")
        target_mode = rel.get("TargetMode", "")
        rel_type = rel.get("Type", "")
        if not rel_id:
            continue

        part_name = None
        content_type = None
        image_format = None
        if target_mode.lower() != "external":
            part_name = _resolve_part_name(SOURCE_DOCUMENT_PART, target)
            if part_name:
                # An explicit Override wins over the per-extension Default.
                content_type = overrides.get(part_name)
                if not content_type:
                    ext = PurePosixPath(part_name).suffix.lower().lstrip(".")
                    content_type = defaults.get(ext)
                image_format = _normalize_image_format(
                    content_type or _infer_format_from_target(part_name)
                )
        else:
            # External targets have no part; the extension is all there is.
            image_format = _normalize_image_format(_infer_format_from_target(target))

        ctx.relationships[rel_id] = DrawingRelationship(
            rel_id=rel_id,
            target=target,
            target_mode=target_mode,
            rel_type=rel_type,
            part_name=part_name,
            content_type=content_type,
            image_format=image_format,
        )
def _extract_blip_relationship(drawing_elem) -> Optional[Tuple[str, str]]:
    """Return ``(kind, rel_id)`` for the first a:blip that references an image.

    ``kind`` is ``"link"`` or ``"embed"``. Blips are visited in document
    order; a blip carrying neither attribute is skipped. Returns None when no
    blip references a relationship.
    """
    link_attr = f"{{{NS['r']}}}link"
    embed_attr = f"{{{NS['r']}}}embed"
    for blip in drawing_elem.findall(".//a:blip", NS):
        # A linked picture may also carry an embedded cache copy on the same
        # blip; the explicit link takes precedence in that case.
        link_id = blip.get(link_attr)
        if link_id:
            return "link", link_id
        embed_id = blip.get(embed_attr)
        if embed_id:
            return "embed", embed_id
    return None
def _extract_imagedata_relationship(container_elem) -> Optional[str]:
    """Return the first non-empty ``r:id`` found on a descendant v:imagedata.

    Legacy VML containers (w:pict / w:object) reference their image through
    ``v:imagedata``. The same ``r:id`` attribute is used whether the image is
    embedded or externally linked, so the caller has to look at the resolved
    relationship's ``TargetMode`` to tell the two apart.

    Returns None when no v:imagedata element carries an ``r:id``.
    """
    id_attr = f"{{{NS['r']}}}id"
    found_ids = (
        node.get(id_attr) for node in container_elem.findall(".//v:imagedata", NS)
    )
    return next((value for value in found_ids if value), None)
def _build_placeholder(attrs: Dict[str, str]) -> str:
    """Render *attrs* as a self-closing ``<drawing ... />`` placeholder.

    ``id``, ``name``, ``path`` and ``format`` come first, in that order; all
    other attributes follow sorted by name so the output is deterministic.
    Attributes whose value is None are omitted and every value is HTML-escaped
    (quotes included).
    """
    leading = ("id", "name", "path", "format")
    trailing = sorted(key for key in attrs if key not in leading)
    rendered = [
        f'{key}="{escape(str(attrs[key]), quote=True)}"'
        for key in (*leading, *trailing)
        if attrs.get(key) is not None
    ]
    return f"<drawing {' '.join(rendered)} />"
def extract_drawing_placeholder_from_element(
    drawing_elem,
    context: Optional[DrawingExtractionContext] = None,
    include_extended_attrs: bool = True,
) -> str:
    """
    Turn a w:drawing element into a <drawing ... /> placeholder string.

    Behavior:
    - id/name are copied from wp:docPr (empty strings when it is missing).
    - Embedded image (a:blip@r:embed with an image relationship): the image is
      exported through the context and path/format are added.
    - Linked image (a:blip@r:link): nothing is downloaded; path is the
      relationship target as stored in the document.
    - No image reference, no context, or an unknown relationship id: only
      id/name are emitted.
    """
    doc_pr = drawing_elem.find(".//wp:docPr", NS)
    if doc_pr is None:
        attrs = {"id": "", "name": ""}
    else:
        attrs = {"id": doc_pr.get("id", ""), "name": doc_pr.get("name", "")}

    if not include_extended_attrs:
        return _build_placeholder(attrs)

    rel_ref = _extract_blip_relationship(drawing_elem)
    if rel_ref is None or context is None:
        return _build_placeholder(attrs)

    rel_kind, rel_id = rel_ref
    rel = context.resolve_relationship(rel_id)
    if rel is None:
        return _build_placeholder(attrs)

    if rel_kind == "embed" and rel.rel_type == IMAGE_REL_TYPE:
        path_value = context.export_embedded_image(rel)
    elif rel_kind == "link":
        path_value = rel.target
    else:
        # Embedded reference to something that is not an image relationship.
        return _build_placeholder(attrs)

    if path_value:
        attrs["path"] = path_value
    if rel.image_format:
        attrs["format"] = rel.image_format
    return _build_placeholder(attrs)
def extract_vml_image_placeholder_from_element(
    container_elem,
    context: Optional[DrawingExtractionContext] = None,
    include_extended_attrs: bool = True,
) -> str:
    """
    Turn a w:pict or w:object element into a <drawing ... /> placeholder.

    Legacy documents and OLE-embedded objects expose their rendered picture
    through VML: a ``<v:imagedata r:id="..."/>`` inside ``<v:shape>``, with the
    bytes commonly being EMF/WMF metafiles. Embedded images are exported
    through the same context as DrawingML images, so they end up in the
    context's export directory next to PNG/JPEG assets.

    id/name are taken from the first v:shape's ``id`` and ``alt`` attributes
    (empty strings when no shape exists). The placeholder has the same format
    as the one produced by ``extract_drawing_placeholder_from_element`` so
    consumers can treat both sources alike.
    """
    shape = container_elem.find(".//v:shape", NS)
    if shape is None:
        attrs = {"id": "", "name": ""}
    else:
        attrs = {"id": shape.get("id", ""), "name": shape.get("alt", "")}

    if not include_extended_attrs:
        return _build_placeholder(attrs)

    rel_id = _extract_imagedata_relationship(container_elem)
    if not rel_id or context is None:
        return _build_placeholder(attrs)

    rel = context.resolve_relationship(rel_id)
    if rel is None or rel.rel_type != IMAGE_REL_TYPE:
        return _build_placeholder(attrs)

    # r:id is shared by embedded and externally linked VML images; only the
    # relationship's TargetMode distinguishes them. Sending an external
    # relationship to export_embedded_image() would return None and lose the
    # linked path, so external targets are passed through unchanged.
    if rel.target_mode.lower() == "external":
        path_value = rel.target
    else:
        path_value = context.export_embedded_image(rel)

    if path_value:
        attrs["path"] = path_value
    if rel.image_format:
        attrs["format"] = rel.image_format
    return _build_placeholder(attrs)
def parse_drawing_attributes(placeholder: str) -> Dict[str, str]:
    """Return the attributes of a <drawing ... /> placeholder as a dict.

    Values are HTML-unescaped. When an attribute name occurs more than once,
    the last occurrence wins.
    """
    parsed: Dict[str, str] = {}
    for match in DRAWING_ATTR_PATTERN.finditer(placeholder):
        attr_name, raw_value = match.group(1), match.group(2)
        parsed[attr_name] = unescape(raw_value)
    return parsed
def normalize_drawing_placeholder(
    placeholder: str,
    include_extended_attrs: bool = False,
) -> str:
    """
    Rewrite a single drawing placeholder with its attributes in canonical order.

    Args:
        placeholder: Input placeholder string
        include_extended_attrs: When False only id/name survive; when True
            path, format and any additional attributes are carried over.

    Missing id/name attributes are emitted as empty strings.
    """
    attrs = parse_drawing_attributes(placeholder)
    canonical = ("id", "name", "path", "format")

    normalized = {key: attrs.get(key, "") for key in canonical[:2]}
    if include_extended_attrs:
        for key in canonical[2:]:
            if key in attrs:
                normalized[key] = attrs[key]
        normalized.update(
            {key: value for key, value in attrs.items() if key not in canonical}
        )
    return _build_placeholder(normalized)
def normalize_drawing_placeholders_in_text(
    text: str,
    include_extended_attrs: bool = False,
) -> str:
    """Return *text* with every <drawing ... /> placeholder normalized.

    Empty input is returned unchanged. Each placeholder is rewritten by
    ``normalize_drawing_placeholder`` with the same include_extended_attrs
    setting; all other text is left as it is.
    """
    if not text:
        return text
    return DRAWING_TAG_PATTERN.sub(
        lambda match: normalize_drawing_placeholder(
            match.group(0),
            include_extended_attrs=include_extended_attrs,
        ),
        text,
    )
|