extract-pptx.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. #!/usr/bin/env python3
  2. """
  3. Extract all content from a PowerPoint file (.pptx).
  4. Writes a JSON file (extracted-slides.json) describing each slide's title, text, images, and notes; extracted images are saved under <output_dir>/assets.
  5. Usage:
  6. python extract-pptx.py <input.pptx> [output_dir]
  7. Requires: pip install python-pptx
  8. """
  9. import json
  10. import os
  11. import sys
  12. from pptx import Presentation
  # Integer values of python-pptx's MSO_SHAPE_TYPE members, kept as plain
  # literals so this module needs no import beyond ``Presentation``.
  _GROUP_SHAPE_TYPE = 6
  _PICTURE_SHAPE_TYPE = 13


  def _iter_shapes(shapes):
      """Yield every shape in *shapes*, descending into group shapes."""
      for shape in shapes:
          if shape.shape_type == _GROUP_SHAPE_TYPE:
              # A group carries no text or image itself; its children do.
              yield from _iter_shapes(shape.shapes)
          else:
              yield shape


  def _save_picture(shape, assets_dir, slide_number, image_index):
      """Write the picture's bytes into *assets_dir* and return its metadata.

      Returns ``None`` when the picture has no embedded image to export.
      """
      try:
          image = shape.image
      except ValueError:
          # python-pptx raises ValueError when the picture has no embedded
          # image (e.g. a linked picture); there are no bytes to write.
          return None

      image_name = f"slide{slide_number}_img{image_index}.{image.ext}"
      with open(os.path.join(assets_dir, image_name), "wb") as f:
          f.write(image.blob)

      return {
          "path": f"assets/{image_name}",
          "width": shape.width,
          "height": shape.height,
      }


  def extract_pptx(file_path, output_dir="."):
      """
      Extract all content from a PowerPoint file.

      Args:
          file_path: The presentation to read; passed to ``Presentation``.
          output_dir: Directory under which an ``assets`` folder is created
              (if missing) to hold the extracted image files.

      Returns:
          A list with one dict per slide, in presentation order, with keys:
          ``number`` (1-based slide index), ``title`` (text of the title
          shape, or ""), ``content`` (list of ``{"type": "text",
          "content": ...}`` dicts for every other shape with a text frame),
          ``images`` (list of dicts with ``path`` relative to *output_dir*,
          ``width`` and ``height`` as reported by the shape) and ``notes``
          (speaker-notes text, or "").
      """
      prs = Presentation(file_path)

      # Create assets directory for extracted images
      assets_dir = os.path.join(output_dir, "assets")
      os.makedirs(assets_dir, exist_ok=True)

      slides_data = []
      for slide_number, slide in enumerate(prs.slides, start=1):
          slide_data = {
              "number": slide_number,
              "title": "",
              "content": [],
              "images": [],
              "notes": "",
          }

          # Looked up once per slide rather than once per shape; this is
          # None when the slide has no title placeholder.
          title_shape = slide.shapes.title

          for shape in _iter_shapes(slide.shapes):
              # Extract text content
              if shape.has_text_frame:
                  if shape == title_shape:
                      slide_data["title"] = shape.text
                  else:
                      slide_data["content"].append(
                          {"type": "text", "content": shape.text}
                      )

              # Extract images
              if shape.shape_type == _PICTURE_SHAPE_TYPE:
                  image_info = _save_picture(
                      shape,
                      assets_dir,
                      slide_number,
                      len(slide_data["images"]) + 1,
                  )
                  if image_info is not None:
                      slide_data["images"].append(image_info)

          # Extract speaker notes
          if slide.has_notes_slide:
              notes_frame = slide.notes_slide.notes_text_frame
              # notes_text_frame is None when the notes slide has no notes
              # placeholder; leave "notes" empty in that case.
              if notes_frame is not None:
                  slide_data["notes"] = notes_frame.text

          slides_data.append(slide_data)

      return slides_data
  def main(argv):
      """Command-line entry point: extract a .pptx file and write a JSON summary.

      Args:
          argv: Argument list in ``sys.argv`` form: ``argv[1]`` is the input
              .pptx path and the optional ``argv[2]`` is the output directory
              (defaults to the current directory).

      Returns:
          Process exit status: 0 on success, 1 on a usage error or when the
          input file does not exist.
      """
      if len(argv) < 2:
          print("Usage: python extract-pptx.py <input.pptx> [output_dir]")
          return 1

      input_file = argv[1]
      output_dir = argv[2] if len(argv) > 2 else "."

      # Fail with a readable message instead of a traceback from the parser.
      if not os.path.isfile(input_file):
          print(f"Error: input file not found: {input_file}", file=sys.stderr)
          return 1

      slides = extract_pptx(input_file, output_dir)

      # Write extracted data as JSON; the encoding is explicit so the file
      # does not depend on the platform's default text encoding.
      output_path = os.path.join(output_dir, "extracted-slides.json")
      with open(output_path, "w", encoding="utf-8") as f:
          json.dump(slides, f, indent=2)

      print(f"Extracted {len(slides)} slides to {output_path}")
      for s in slides:
          img_count = len(s["images"])
          print(f" Slide {s['number']}: {s['title'] or '(no title)'} — {img_count} image(s)")
      return 0


  if __name__ == "__main__":
      sys.exit(main(sys.argv))