# multimodal_outputs.py (2.4 KB)
  1. """
  2. Example demonstrating multimodal tool outputs (image + file) using Agency Swarm.
  3. Flow: tools return images or files -> agent reads them -> responds with a description.
  4. Two BaseTool classes are defined:
  5. 1. ``LoadShowcaseImage`` serves a local image via ``tool_output_image_from_path``.
  6. 2. ``LoadReferenceReport`` returns a remotely hosted PDF via ``tool_output_file_from_url``.
  7. Run with a valid OpenAI API key configured in your environment.
  8. """
  9. import asyncio
  10. import os
  11. import sys
  12. from pathlib import Path
  13. # Allow running the example directly from the repository.
  14. sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src")))
  15. from pydantic import Field
  16. from agency_swarm import Agency, Agent, BaseTool, ToolOutputFileContent, ToolOutputImage
  17. from agency_swarm.tools.utils import tool_output_file_from_url, tool_output_image_from_path
  18. DATA_DIR = Path(__file__).resolve().parent / "data"
  19. REFERENCE_PDF_URL = "https://raw.githubusercontent.com/VRSEN/agency-swarm/main/examples/data/daily_revenue_report.pdf"
  20. class LoadShowcaseImage(BaseTool):
  21. """Return the latest gallery image as a multimodal output."""
  22. path: Path = Field(default=DATA_DIR / "daily_revenue.png", description="Image to publish")
  23. detail: str = Field(default="auto", description="Vision model detail level")
  24. def run(self) -> ToolOutputImage:
  25. return tool_output_image_from_path(self.path, detail=self.detail)
  26. class LoadReferenceReport(BaseTool):
  27. """Return the reference PDF hosted remotely."""
  28. source_url: str = Field(default=REFERENCE_PDF_URL, description="Remote PDF to attach")
  29. def run(self) -> ToolOutputFileContent:
  30. return tool_output_file_from_url(self.source_url)
  31. def create_multimodal_agency() -> Agency:
  32. gallery_agent = Agent(
  33. name="GalleryAgent",
  34. description="Provides gallery outputs with narrative context.",
  35. instructions="Call LoadShowcaseImage when asked for the latest gallery image. "
  36. "Use LoadReferenceReport when a supporting document is requested.",
  37. tools=[LoadShowcaseImage, LoadReferenceReport],
  38. model="gpt-5.4-mini",
  39. )
  40. return Agency(gallery_agent)
  41. async def main() -> None:
  42. agency = create_multimodal_agency()
  43. response = await agency.get_response("Analyze the daily revenue graph, and summarize the supporting report.")
  44. print("Final response:")
  45. print(response.final_output)
  46. if __name__ == "__main__":
  47. asyncio.run(main())