_zip.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. """Shared zip-bundle extraction for external parser engines.
  2. Engines like docling return their full output as a zip archive. This helper
  3. extracts it safely (refusing path traversal / absolute paths) into a target
  4. directory. Engine-specific post-extraction normalization (e.g. mineru's
  5. nested-subdir hoist) is *not* done here — each engine's client handles its
  6. own quirks.
  7. """
  8. from __future__ import annotations
  9. import io
  10. import os
  11. import zipfile
  12. from pathlib import Path
  13. def safe_extract_zip(payload: bytes, dest_dir: Path) -> list[str]:
  14. """Extract a zip archive into ``dest_dir``, refusing unsafe paths.
  15. Raises ``RuntimeError`` if any entry name is absolute or contains ``..``
  16. components after normalization. Returns the list of extracted member
  17. names (as stored in the zip, prior to OS-specific normalization), so
  18. callers can validate the bundle layout without re-walking the directory.
  19. """
  20. dest_dir.mkdir(parents=True, exist_ok=True)
  21. buf = io.BytesIO(payload)
  22. with zipfile.ZipFile(buf) as zf:
  23. names = zf.namelist()
  24. for name in names:
  25. norm = os.path.normpath(name)
  26. if (
  27. norm.startswith("..")
  28. or os.path.isabs(norm)
  29. or norm.startswith(("/", os.sep))
  30. ):
  31. raise RuntimeError(f"Refusing zip entry with unsafe path: {name!r}")
  32. zf.extractall(dest_dir)
  33. return names
# Public API of this module: only the extraction helper is exported.
__all__ = ["safe_extract_zip"]