#!/usr/bin/env python3
# Origin (page breadcrumb): wxcz_admin / lightrag-cn-git-d5efd3
"""
ABOUTME: Shared drawing/image extraction utilities for DOCX parsing and editing
ABOUTME: Resolves w:drawing -> a:blip relationships, exports embedded images, builds placeholders
"""

from __future__ import annotations

import posixpath
import re
import shutil
import zipfile
from dataclasses import dataclass, field
from html import escape, unescape
from pathlib import Path, PurePosixPath
from typing import Dict, Optional, Tuple
from urllib.parse import urlparse

try:
    from defusedxml import ElementTree as ET
except ImportError:  # pragma: no cover
    from xml.etree import ElementTree as ET


# XML namespace prefix -> URI map. It is passed to ElementTree find()/findall()
# calls in this module so path expressions can use the short prefixes
# (w:, wp:, a:, r:, v:), and it is also used to build qualified attribute
# names such as "{<r-namespace>}embed".
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "v": "urn:schemas-microsoft-com:vml",
}

# Namespace of the <Relationship> elements read from the .rels part.
REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
# Namespace of the <Default>/<Override> elements read from [Content_Types].xml.
CONTENT_TYPE_NS = "http://schemas.openxmlformats.org/package/2006/content-types"
# Relationship "Type" value that the placeholder builders compare against
# before treating a relationship target as an image.
IMAGE_REL_TYPE = (
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
)
# Part name that relative relationship targets are resolved against.
SOURCE_DOCUMENT_PART = "/word/document.xml"

# Match old and new drawing placeholders (requires id/name, allows extra attributes)
# The two lookaheads assert that both id="..." and name="..." occur somewhere
# inside the tag, in any order, before the self-closing "/>" is consumed.
DRAWING_PATTERN = re.compile(
    r'<drawing\b(?=[^>]*\bid="[^"]*")(?=[^>]*\bname="[^"]*")[^>]*/>'
)
# Any self-closing <drawing .../> tag, with or without id/name attributes.
DRAWING_TAG_PATTERN = re.compile(r"<drawing\b[^>]*/>")
# One name="value" attribute pair; group 1 is the name, group 2 the raw
# (still HTML-escaped) value.
DRAWING_ATTR_PATTERN = re.compile(r'([a-zA-Z_][\w:.-]*)="([^"]*)"')


@dataclass
class DrawingRelationship:
    """Relationship metadata for a single relationship ID.

    Instances are created by ``load_relationships`` from the
    ``<Relationship>`` elements of ``word/_rels/document.xml.rels``.
    """

    # Value of the "Id" attribute (e.g. "rId5"); the lookup key.
    rel_id: str
    # Raw "Target" attribute; "" when the attribute is absent.
    target: str
    # Raw "TargetMode" attribute; "" when absent. Compared case-insensitively
    # against "external" by the code that consumes it.
    target_mode: str
    # Raw "Type" attribute; "" when absent.
    rel_type: str
    # Absolute part name resolved from ``target`` (leading "/"); left as None
    # for external relationships.
    part_name: Optional[str] = None
    # Content type looked up in [Content_Types].xml (Override by part name,
    # else Default by extension); None when not found or external.
    content_type: Optional[str] = None
    # Normalized format token derived from the content type or the target's
    # file extension; None when nothing could be derived.
    image_format: Optional[str] = None


@dataclass
class DrawingExtractionContext:
    """Per-document state for resolving relationships and exporting images.

    ``docx_path`` is the archive that image bytes are read from. The
    ``export_dir_*`` fields describe where exported images are written; when
    either is unset, exporting is disabled and only lookups work. The two
    underscore-prefixed dicts are bookkeeping: one remembers which parts were
    already exported, the other which output filenames are taken.
    """

    docx_path: Path
    blocks_output_path: Optional[Path] = None
    export_dir_name: Optional[str] = None
    export_dir_path: Optional[Path] = None
    relationships: Dict[str, DrawingRelationship] = field(default_factory=dict)
    _exported_part_to_relpath: Dict[str, str] = field(default_factory=dict)
    _used_filenames: Dict[str, str] = field(default_factory=dict)

    def resolve_relationship(self, rel_id: str) -> Optional[DrawingRelationship]:
        """Return the relationship registered under ``rel_id``, or None."""
        return self.relationships.get(rel_id)

    def export_embedded_image(self, rel: DrawingRelationship) -> Optional[str]:
        """
        Copy the image part behind ``rel`` out of the DOCX into the export dir.

        Each part is written at most once; repeated calls for the same part
        return the path recorded by the first export.

        Returns:
            Relative path like "<blocks_stem>.image/image1.png" if exported,
            or None when exporting is disabled, the relationship is external
            or has no part name, or the part cannot be read from the archive.
        """
        if not (self.export_dir_path and self.export_dir_name):
            return None
        if rel.target_mode.lower() == "external" or not rel.part_name:
            return None

        already_exported = self._exported_part_to_relpath.get(rel.part_name)
        if already_exported is not None:
            return already_exported

        # Zip member names carry no leading slash, unlike part names.
        member_name = rel.part_name.lstrip("/")
        try:
            with zipfile.ZipFile(self.docx_path, "r") as archive:
                payload = archive.read(member_name)
        except Exception:
            # Best effort: an unreadable archive or missing member simply
            # means there is no exported path for this relationship.
            return None

        source_name = PurePosixPath(rel.part_name).name
        filename = self._dedupe_filename(source_name or "image")
        (self.export_dir_path / filename).write_bytes(payload)

        exported_relpath = str(PurePosixPath(self.export_dir_name) / filename)
        self._exported_part_to_relpath[rel.part_name] = exported_relpath
        return exported_relpath

    def _dedupe_filename(self, base_name: str) -> str:
        """Reserve and return a filename not handed out before.

        The first request for a name gets it unchanged; later requests get
        "<stem>_2<suffix>", "<stem>_3<suffix>", ... using the first free index.
        """
        taken = self._used_filenames
        chosen = base_name
        if chosen in taken:
            original = Path(base_name)
            counter = 2
            chosen = f"{original.stem}_{counter}{original.suffix}"
            while chosen in taken:
                counter += 1
                chosen = f"{original.stem}_{counter}{original.suffix}"
        taken[chosen] = chosen
        return chosen


# Spellings that denote the same format as a canonical token. Without this,
# a ".tif" extension yields "tif" while an "image/tiff" content type yields
# "tiff" for the very same image.
_IMAGE_FORMAT_ALIASES = {
    "jpg": "jpeg",
    "jpe": "jpeg",
    "tif": "tiff",
}


def _normalize_image_format(ext_or_type: Optional[str]) -> Optional[str]:
    """Reduce a file extension or MIME content type to a short format token.

    Args:
        ext_or_type: An extension with or without a leading dot (".PNG",
            "jpg"), an ``image/*`` content type ("image/svg+xml",
            "image/x-emf"), or None/empty.

    Returns:
        A lowercase token such as "png", "jpeg", "tiff", "svg" or "emf".
        Values that are not recognised are returned lowercased and stripped
        rather than rejected. Returns None for None, empty or blank input.
    """
    if not ext_or_type:
        return None
    value = ext_or_type.strip().lower()

    # Content-Type: keep the subtype, drop a structured-syntax suffix
    # ("svg+xml" -> "svg") and the experimental "x-" prefix ("x-emf" -> "emf").
    if value.startswith("image/"):
        value = value.split("/", 1)[1]
        value = value.split("+", 1)[0]
        if value.startswith("x-"):
            value = value[2:]

    # Extension (with or without leading dot)
    value = value.lstrip(".")
    return _IMAGE_FORMAT_ALIASES.get(value, value) or None


def _infer_format_from_target(target: str) -> Optional[str]:
    """Guess an image format token from a relationship target's extension.

    Args:
        target: A relationship target: a URL, or a relative/absolute path.

    Returns:
        The normalized format derived from the final path component's suffix,
        or None when ``target`` is empty or has no usable suffix.
    """
    if not target:
        return None
    try:
        parsed = urlparse(target)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. an unterminated IPv6
        # bracket). Targets come straight from the document, so treat the
        # raw string as a path instead of letting one bad link abort parsing.
        path = target
    else:
        # Only trust the parsed path when there is a scheme; for plain file
        # paths the raw string is used so characters like "#" stay part of
        # the name instead of being split off as a fragment.
        path = parsed.path if parsed.scheme else target
    suffix = PurePosixPath(path).suffix
    return _normalize_image_format(suffix)


def _resolve_part_name(source_part_name: str, target: str) -> str:
    """Turn a relationship target into a normalized, rooted part name.

    A target that already starts with "/" is only normalized. Any other
    target is interpreted relative to the directory of ``source_part_name``,
    normalized, and given a leading "/" if it does not have one.
    """
    if target.startswith("/"):
        return posixpath.normpath(target)

    base_dir = posixpath.dirname(source_part_name)
    resolved = posixpath.normpath(posixpath.join(base_dir, target))
    return resolved if resolved.startswith("/") else "/" + resolved


def create_drawing_context(
    docx_path: str,
    blocks_output_path: Optional[str] = None,
) -> DrawingExtractionContext:
    """
    Build a ready-to-use extraction context for one DOCX file.

    When blocks_output_path is given, the export directory
    `<blocks_stem>.image/` next to the blocks file is recreated from scratch:
    an existing directory of that name is deleted together with its contents
    before a fresh, empty one is made. Without it, the context can resolve
    relationships but will not export any image.

    The document's relationships are loaded before the context is returned.
    """
    docx_file = Path(docx_path)
    export_fields = {}

    if blocks_output_path:
        blocks_file = Path(blocks_output_path)
        dir_name = f"{blocks_file.stem}.image"
        dir_path = blocks_file.parent / dir_name
        # Start from an empty directory so stale images from an earlier run
        # cannot be mistaken for output of this one.
        if dir_path.exists():
            shutil.rmtree(dir_path)
        dir_path.mkdir(parents=True, exist_ok=True)
        export_fields = {
            "blocks_output_path": blocks_file,
            "export_dir_name": dir_name,
            "export_dir_path": dir_path,
        }

    ctx = DrawingExtractionContext(docx_path=docx_file, **export_fields)
    load_relationships(ctx)
    return ctx


def load_relationships(ctx: DrawingExtractionContext) -> None:
    """Fill ``ctx.relationships`` from the main document's relationship part.

    Reads ``[Content_Types].xml`` (when present) to learn content types, then
    ``word/_rels/document.xml.rels`` for the relationships themselves. Each
    relationship with an ``Id`` is stored as a ``DrawingRelationship`` keyed
    by that id. For non-external relationships the target is resolved to a
    part name and a content type / format is looked up; for external ones
    only a format is inferred from the target's extension.

    This is best effort: if the archive cannot be opened, either XML part
    fails to parse, or the relationship part is missing, the function returns
    without adding anything.

    Args:
        ctx: Context whose ``docx_path`` is read and whose ``relationships``
            dict is updated in place.
    """
    rels_xml = "word/_rels/document.xml.rels"
    content_types_xml = "[Content_Types].xml"

    overrides: Dict[str, str] = {}
    defaults: Dict[str, str] = {}

    try:
        with zipfile.ZipFile(ctx.docx_path, "r") as zf:
            # Build the member index once; namelist() creates a new list on
            # every call and list membership is a linear scan.
            members = set(zf.namelist())

            if content_types_xml in members:
                # Close each member stream explicitly instead of leaving it
                # to garbage collection.
                with zf.open(content_types_xml) as stream:
                    ct_root = ET.parse(stream).getroot()
                for node in ct_root.findall(f".//{{{CONTENT_TYPE_NS}}}Override"):
                    part_name = node.get("PartName")
                    content_type = node.get("ContentType")
                    if part_name and content_type:
                        overrides[part_name] = content_type
                for node in ct_root.findall(f".//{{{CONTENT_TYPE_NS}}}Default"):
                    ext = node.get("Extension")
                    content_type = node.get("ContentType")
                    if ext and content_type:
                        defaults[ext.lower()] = content_type

            if rels_xml not in members:
                return
            with zf.open(rels_xml) as stream:
                rels_root = ET.parse(stream).getroot()
    except Exception:
        # Deliberately broad: a damaged or non-DOCX input should degrade to
        # "no relationships known" rather than abort the caller.
        return

    for rel in rels_root.findall(f".//{{{REL_NS}}}Relationship"):
        rel_id = rel.get("Id")
        target = rel.get("Target", "")
        target_mode = rel.get("TargetMode", "")
        rel_type = rel.get("Type", "")
        if not rel_id:
            continue

        part_name = None
        content_type = None
        image_format = None

        if target_mode.lower() != "external":
            part_name = _resolve_part_name(SOURCE_DOCUMENT_PART, target)
            if part_name:
                # An Override for the exact part wins over the Default
                # registered for the part's extension.
                content_type = overrides.get(part_name)
                if not content_type:
                    ext = PurePosixPath(part_name).suffix.lower().lstrip(".")
                    content_type = defaults.get(ext)
                image_format = _normalize_image_format(
                    content_type or _infer_format_from_target(part_name)
                )
        else:
            image_format = _normalize_image_format(_infer_format_from_target(target))

        ctx.relationships[rel_id] = DrawingRelationship(
            rel_id=rel_id,
            target=target,
            target_mode=target_mode,
            rel_type=rel_type,
            part_name=part_name,
            content_type=content_type,
            image_format=image_format,
        )


def _extract_blip_relationship(drawing_elem) -> Optional[Tuple[str, str]]:
    """Return ("link" | "embed", relationship id) for the first usable a:blip.

    Blips are visited in document order. On each blip ``r:link`` is checked
    before ``r:embed``, so a linked picture is reported as a link even when
    the same blip also carries an embedded copy (Word may keep an embedded
    cache for linked pictures). Returns None when no blip has either
    attribute set.
    """
    probes = (
        ("link", f"{{{NS['r']}}}link"),
        ("embed", f"{{{NS['r']}}}embed"),
    )
    for blip in drawing_elem.findall(".//a:blip", NS):
        for kind, attr_name in probes:
            rel_id = blip.get(attr_name)
            if rel_id:
                return kind, rel_id
    return None


def _extract_imagedata_relationship(container_elem) -> Optional[str]:
    """Return the first non-empty ``r:id`` found on a nested v:imagedata.

    ``container_elem`` is a legacy VML container (w:pict / w:object), which
    is how Word references EMF/WMF metafiles and the rendered preview of
    embedded OLE objects. The same ``r:id`` attribute is used for embedded
    and externally linked images alike; only the ``TargetMode`` of the
    resolved relationship tells them apart, so callers must inspect the
    relationship rather than assume the image is embedded.

    Returns None when no v:imagedata descendant carries an ``r:id``.
    """
    rid_key = "{%s}id" % NS["r"]
    found_ids = (
        node.get(rid_key)
        for node in container_elem.findall(".//v:imagedata", NS)
    )
    return next((rid for rid in found_ids if rid), None)


def _build_placeholder(attrs: Dict[str, str]) -> str:
    """Render ``attrs`` as a self-closing ``<drawing ... />`` placeholder.

    id, name, path and format are emitted first, in that order; every other
    attribute follows sorted by name so the output is deterministic.
    Attributes whose value is None are skipped, and values are HTML-escaped
    (including quotes) after conversion to ``str``.
    """
    leading = ("id", "name", "path", "format")
    emit_order = [key for key in leading if key in attrs]
    # Preserve extra attributes deterministically (sorted by name)
    emit_order.extend(sorted(key for key in attrs if key not in leading))

    rendered = [
        f'{key}="{escape(str(attrs[key]), quote=True)}"'
        for key in emit_order
        if attrs[key] is not None
    ]
    return f"<drawing {' '.join(rendered)} />"


def extract_drawing_placeholder_from_element(
    drawing_elem,
    context: Optional[DrawingExtractionContext] = None,
    include_extended_attrs: bool = True,
) -> str:
    """
    Produce the <drawing ... /> placeholder for one w:drawing element.

    Behavior:
    - id/name are taken from wp:docPr; both are empty strings when the
      element has no wp:docPr.
    - Embedded image (a:blip@r:embed resolving to an image relationship):
      the image is exported through ``context`` and path/format are set.
    - Linked image (a:blip@r:link): nothing is downloaded; path is the
      relationship's original target, format is set when known.
    - No blip reference, no ``context``, an unknown relationship id, or
      ``include_extended_attrs=False``: only id/name are emitted.
    """
    doc_pr = drawing_elem.find(".//wp:docPr", NS)
    if doc_pr is None:
        attrs = {"id": "", "name": ""}
    else:
        attrs = {"id": doc_pr.get("id", ""), "name": doc_pr.get("name", "")}

    rel = None
    rel_kind = None
    if include_extended_attrs:
        blip_ref = _extract_blip_relationship(drawing_elem)
        if blip_ref is not None and context is not None:
            rel_kind, rel_id = blip_ref
            rel = context.resolve_relationship(rel_id)

    if rel is None:
        return _build_placeholder(attrs)

    if rel_kind == "embed" and rel.rel_type == IMAGE_REL_TYPE:
        location = context.export_embedded_image(rel)
    elif rel_kind == "link":
        location = rel.target
    else:
        # An embed that does not point at an image relationship carries no
        # path/format information.
        return _build_placeholder(attrs)

    if location:
        attrs["path"] = location
    if rel.image_format:
        attrs["format"] = rel.image_format
    return _build_placeholder(attrs)


def extract_vml_image_placeholder_from_element(
    container_elem,
    context: Optional[DrawingExtractionContext] = None,
    include_extended_attrs: bool = True,
) -> str:
    """
    Produce the <drawing ... /> placeholder for a w:pict or w:object element.

    Legacy Word documents and OLE-embedded objects (Visio diagrams, equation
    editor previews, etc.) expose their rendered image via VML rather than
    DrawingML: a ``<v:imagedata r:id="..."/>`` inside ``<v:shape>``, commonly
    backed by an EMF/WMF metafile. Embedded images are exported through the
    same ``context`` as DrawingML images, so they end up in the same export
    directory.

    id/name come from the first v:shape's ``id`` and ``alt`` attributes
    (empty strings when there is no v:shape). path/format are added only
    when the v:imagedata relationship resolves to an image relationship.
    The placeholder format matches
    ``extract_drawing_placeholder_from_element`` so downstream consumers
    can treat both sources uniformly.
    """
    shape = container_elem.find(".//v:shape", NS)
    if shape is None:
        attrs = {"id": "", "name": ""}
    else:
        attrs = {"id": shape.get("id", ""), "name": shape.get("alt", "")}

    if not include_extended_attrs:
        return _build_placeholder(attrs)

    rel_id = _extract_imagedata_relationship(container_elem)
    if not rel_id or context is None:
        return _build_placeholder(attrs)

    rel = context.resolve_relationship(rel_id)
    if rel is None or rel.rel_type != IMAGE_REL_TYPE:
        return _build_placeholder(attrs)

    # VML uses r:id for embedded parts and external links alike; only the
    # resolved TargetMode distinguishes them. An external relationship must
    # not go through export_embedded_image(), which returns None for it and
    # would make the linked path disappear from the placeholder.
    if rel.target_mode.lower() == "external":
        location = rel.target
    else:
        location = context.export_embedded_image(rel)

    if location:
        attrs["path"] = location
    if rel.image_format:
        attrs["format"] = rel.image_format
    return _build_placeholder(attrs)


def parse_drawing_attributes(placeholder: str) -> Dict[str, str]:
    """Return the name -> value attributes of a <drawing ... /> placeholder.

    Values are HTML-unescaped. If an attribute name occurs more than once,
    the last occurrence wins.
    """
    parsed: Dict[str, str] = {}
    for match in DRAWING_ATTR_PATTERN.finditer(placeholder):
        attr_name, raw_value = match.groups()
        parsed[attr_name] = unescape(raw_value)
    return parsed


def normalize_drawing_placeholder(
    placeholder: str,
    include_extended_attrs: bool = False,
) -> str:
    """
    Rewrite one drawing placeholder with attributes in canonical order.

    id and name are always present in the result (as empty strings when the
    input lacks them).

    Args:
        placeholder: Input placeholder string
        include_extended_attrs: If False, keeps only id/name. If True, every
            other attribute found on the input is carried over as well.
    """
    parsed = parse_drawing_attributes(placeholder)
    core = {key: parsed.get(key, "") for key in ("id", "name")}
    if not include_extended_attrs:
        return _build_placeholder(core)

    # Final attribute order is decided by _build_placeholder, so path/format
    # and any further attributes can be merged in one go.
    extras = {key: value for key, value in parsed.items() if key not in core}
    return _build_placeholder({**core, **extras})


def normalize_drawing_placeholders_in_text(
    text: str,
    include_extended_attrs: bool = False,
) -> str:
    """Return ``text`` with every <drawing ... /> tag normalized in place.

    Empty input is returned unchanged. ``include_extended_attrs`` is
    forwarded to ``normalize_drawing_placeholder`` for each tag.
    """
    if not text:
        return text

    return DRAWING_TAG_PATTERN.sub(
        lambda found: normalize_drawing_placeholder(
            found.group(0),
            include_extended_attrs=include_extended_attrs,
        ),
        text,
    )