# test_responses_api_tools.py
import asyncio
import logging
import os
import tempfile
from pathlib import Path

import pytest
from agents import (
    Agent,
    ModelSettings,
    RunConfig,
    Runner,
    function_tool,
)
from agents.models.openai_responses import OpenAIResponsesModel
from openai import AsyncOpenAI
from pydantic import BaseModel

from agency_swarm import Agency, Agent as AgencySwarmAgent
  17. # Configure logging to see SDK and HTTP client details
  18. logging.basicConfig(level=logging.DEBUG)
  19. logger = logging.getLogger(__name__)
  20. # Ensure API key is available
  21. OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Argument model for simple_processor_tool.
class SimpleToolParams(BaseModel):
    # Text to process; the tool returns it prefixed with "Processed: ".
    input_string: str
# Return model for simple_processor_tool.
class SimpleToolOutput(BaseModel):
    # The input text with the "Processed: " prefix applied.
    processed_string: str
  26. @function_tool
  27. def simple_processor_tool(params: SimpleToolParams) -> SimpleToolOutput:
  28. logger.debug(f"simple_processor_tool called with: {params.input_string}")
  29. return SimpleToolOutput(processed_string=f"Processed: {params.input_string}")
# Argument model for calculator_tool.
class CalculatorToolParams(BaseModel):
    # Left-hand operand.
    a: float
    # Right-hand operand; calculator_tool rejects 0 when operation is "divide".
    b: float
    # One of "add", "subtract", "multiply", "divide"; calculator_tool raises ValueError otherwise.
    operation: str  # "add", "subtract", "multiply", "divide"
# Return model for calculator_tool.
class CalculatorToolOutput(BaseModel):
    # Numeric outcome of the requested operation.
    result: float
    # Human-readable rendering of the form "<a> <operation> <b> = <result>".
    calculation: str
  37. @function_tool
  38. def calculator_tool(params: CalculatorToolParams) -> CalculatorToolOutput:
  39. """A calculator tool that performs basic arithmetic operations."""
  40. logger.debug(f"calculator_tool called with: {params.a} {params.operation} {params.b}")
  41. if params.operation == "add":
  42. result = params.a + params.b
  43. elif params.operation == "subtract":
  44. result = params.a - params.b
  45. elif params.operation == "multiply":
  46. result = params.a * params.b
  47. elif params.operation == "divide":
  48. if params.b == 0:
  49. raise ValueError("Cannot divide by zero")
  50. result = params.a / params.b
  51. else:
  52. raise ValueError(f"Unsupported operation: {params.operation}")
  53. calculation = f"{params.a} {params.operation} {params.b} = {result}"
  54. return CalculatorToolOutput(result=result, calculation=calculation)
  55. @pytest.mark.asyncio
  56. async def test_tool_cycle_with_sdk_and_responses_api():
  57. """
  58. Integration test verifying that the openai-agents SDK properly handles tool cycles
  59. with the OpenAI Responses API.
  60. This test ensures that:
  61. 1. Tools can be called successfully using the SDK
  62. 2. Tool outputs are processed correctly
  63. 3. The agent can provide a final response incorporating tool results
  64. 4. The SDK's tool use behavior works as expected with the Responses API
  65. """
  66. # Explicitly create an AsyncOpenAI client
  67. client = AsyncOpenAI(api_key=OPENAI_API_KEY)
  68. forced_responses_model = OpenAIResponsesModel(model="gpt-5.4-mini", openai_client=client)
  69. agent = Agent(
  70. name="SDK Responses API Test Agent",
  71. instructions="You are an agent that uses tools. When asked to process text, use the simple_processor_tool.",
  72. tools=[simple_processor_tool],
  73. tool_use_behavior="run_llm_again", # Send tool output back to LLM for final response
  74. model=forced_responses_model,
  75. )
  76. logger.info("Testing tool cycle with SDK Agent using OpenAIResponsesModel")
  77. # Test that the agent can successfully use tools and provide a response
  78. result = await Runner.run(agent, input="Please process the text 'hello world' using your tool.")
  79. # Verify the run completed successfully
  80. assert result is not None, "Runner.run should return a result"
  81. assert result.final_output is not None, "Result should have a final output"
  82. logger.info(f"Final output: {result.final_output}")
  83. logger.info(f"Number of new items: {len(result.new_items) if result.new_items else 0}")
  84. # Verify that the tool was actually called and the output was processed
  85. final_output_str = str(result.final_output).lower()
  86. # The tool should have processed "hello world" to "Processed: hello world"
  87. assert "processed" in final_output_str, f"Tool output should be processed. Got: {result.final_output}"
  88. assert "hello world" in final_output_str, f"Original input should be referenced. Got: {result.final_output}"
  89. # Verify that we have the expected items in the result
  90. assert result.new_items is not None and len(result.new_items) > 0, "Should have new items from the run"
  91. # Debug: Print the actual items to understand the structure
  92. logger.info("Actual items returned:")
  93. for i, item in enumerate(result.new_items):
  94. logger.info(f" Item {i + 1}: {type(item).__name__} - {item}")
  95. if hasattr(item, "raw_item"):
  96. logger.info(f" Raw item type: {type(item.raw_item)}")
  97. # Check that we have meaningful output from the tool
  98. # The agent should have used the tool and incorporated the result
  99. assert "processed" in final_output_str, f"Tool should have been used to process text. Got: {result.final_output}"
  100. # Verify the tool was actually executed by checking for tool-related items
  101. # Look for any tool-related items (calls or outputs)
  102. tool_related_items = [
  103. item
  104. for item in result.new_items
  105. if hasattr(item, "raw_item")
  106. and ("function" in str(type(item.raw_item)).lower() or "tool" in str(type(item.raw_item)).lower())
  107. ]
  108. logger.info(f"Found {len(tool_related_items)} tool-related items")
  109. # The test passes if the tool was used (evidenced by the output) and we got a response
  110. # The exact structure of items may vary by SDK version, but the functionality should work
  111. assert len(result.new_items) > 0, "Should have generated some items during execution"
  112. logger.info("✅ SDK tool cycle with Responses API working correctly")
  113. @pytest.mark.asyncio
  114. async def test_tool_output_conversion_bug_two_turn_conversation():
  115. """
  116. Integration test verifying that ToolCallOutputItem is correctly converted in Agency Swarm.
  117. This test ensures that tool outputs are properly formatted in conversation history
  118. for multi-turn conversations, allowing agents to reference previous tool results.
  119. Test scenario:
  120. 1. First turn: Agent uses calculator tool to perform a calculation
  121. 2. Second turn: Ask agent to reference the previous calculation result
  122. This verifies that tool outputs are preserved correctly and accessible in subsequent turns.
  123. """
  124. # Create Agency Swarm agent with calculator tool
  125. agent = AgencySwarmAgent(
  126. name="Calculator Agent",
  127. instructions="You are a calculator assistant. Use the calculator tool for arithmetic operations.",
  128. model="gpt-5.4-mini",
  129. )
  130. # Add the calculator tool to the agent
  131. agent.add_tool(calculator_tool)
  132. # Create an agency with the agent
  133. agency = Agency(agent)
  134. # TURN 1: Ask agent to perform a calculation
  135. logger.info("=== TURN 1: Performing calculation ===")
  136. result1 = await agency.get_response(message="Please calculate 15 + 27 using the calculator tool.")
  137. # Verify the first turn completed successfully
  138. assert result1 is not None
  139. logger.info(f"Turn 1 result: {result1.final_output}")
  140. # Get the conversation history from the agency's thread manager
  141. history_after_turn1 = agency.thread_manager._store.messages
  142. logger.info(f"=== CONVERSATION HISTORY AFTER TURN 1 ({len(history_after_turn1)} items) ===")
  143. for i, item in enumerate(history_after_turn1):
  144. logger.info(f"Item {i + 1}: {item}")
  145. # TURN 2: Ask agent to reference the previous calculation
  146. logger.info("=== TURN 2: Referencing previous calculation ===")
  147. result2 = await agency.get_response(
  148. message="What was the result of the calculation you just performed? Please tell me the exact result."
  149. )
  150. # Verify the second turn completed successfully
  151. assert result2 is not None
  152. logger.info(f"Turn 2 result: {result2.final_output}")
  153. # Get the final conversation history from the agency's thread manager
  154. history_after_turn2 = agency.thread_manager._store.messages
  155. logger.info(f"=== FINAL CONVERSATION HISTORY ({len(history_after_turn2)} items) ===")
  156. for i, item in enumerate(history_after_turn2):
  157. logger.info(f"Item {i + 1}: {item}")
  158. # TEST ASSERTIONS
  159. # 1. Verify that tool outputs in conversation history have correct format
  160. tool_output_items = [
  161. item for item in history_after_turn2 if isinstance(item, dict) and item.get("type") == "function_call_output"
  162. ]
  163. logger.info(f"Found {len(tool_output_items)} tool output items in conversation history")
  164. # 2. Check if any tool outputs were incorrectly converted to assistant messages
  165. incorrect_assistant_messages = [
  166. item
  167. for item in history_after_turn2
  168. if (
  169. isinstance(item, dict)
  170. and item.get("role") == "assistant"
  171. and isinstance(item.get("content"), str)
  172. and "Tool output for call" in item.get("content", "")
  173. )
  174. ]
  175. logger.info(f"Found {len(incorrect_assistant_messages)} incorrectly converted tool outputs")
  176. # 3. Verify no incorrect conversions occurred
  177. if incorrect_assistant_messages:
  178. logger.error("BUG DETECTED: Tool outputs incorrectly converted to assistant messages:")
  179. for msg in incorrect_assistant_messages:
  180. logger.error(f" {msg}")
  181. assert len(incorrect_assistant_messages) == 0, (
  182. f"Found {len(incorrect_assistant_messages)} incorrectly converted tool outputs. "
  183. "ToolCallOutputItem should use SDK's to_input_item() method, not convert to assistant messages."
  184. )
  185. # 4. Verify that the agent can correctly reference previous tool outputs
  186. # The second response should mention the calculation result (42)
  187. final_response = str(result2.final_output).lower()
  188. assert "42" in final_response, (
  189. f"Agent should be able to reference the previous calculation result (42). Got response: {result2.final_output}"
  190. )
  191. logger.info("✅ Tool output conversion working correctly - no conversion bugs detected")
  192. @pytest.mark.asyncio
  193. @pytest.mark.skipif(
  194. os.getenv("CI") == "true",
  195. reason="Requires live OpenAI API; skipped on CI to avoid upstream flake.",
  196. )
  197. async def test_hosted_tool_output_preservation_multi_turn():
  198. """
  199. Integration test for hosted tool output preservation in multi-turn conversations.
  200. This test verifies that hosted tools (FileSearch, WebSearch) results are properly
  201. preserved in conversation history for future reference.
  202. Test scenario:
  203. 1. First turn: Agent uses FileSearch tool but doesn't reveal specific details
  204. 2. Second turn: Ask agent to provide exact tool output from previous search
  205. This ensures hosted tool results are preserved and accessible in subsequent turns,
  206. solving the bug where they were previously lost between conversations.
  207. """
  208. # Create test data with specific content for numeric validation
  209. with tempfile.TemporaryDirectory(prefix="hosted_tool_test_") as temp_dir_str:
  210. temp_dir = Path(temp_dir_str)
  211. test_file = temp_dir / "company_data.txt"
  212. test_file.write_text("""
  213. COMPANY FINANCIAL REPORT
  214. Revenue Information:
  215. - Q4 Revenue: $7,892,345.67
  216. - Q3 Revenue: $6,234,567.89
  217. - Operating Costs: $2,345,678.90
  218. - Net Profit: $4,123,456.78
  219. Employee Data:
  220. - Total Employees: 1,234
  221. - New Hires: 567
  222. - Contractors: 89
  223. Product Sales:
  224. - Product Alpha: 12,345 units
  225. - Product Beta: 6,789 units
  226. - Product Gamma: 2,345 units
  227. """)
  228. # Create Agency Swarm agent with FileSearch via files_folder
  229. agent = AgencySwarmAgent(
  230. name="DataSearchAgent",
  231. instructions=(
  232. "You are a data search assistant. You MUST use the FileSearch tool to find information. "
  233. "Always search files before answering. Be concise in your initial responses."
  234. ),
  235. model="gpt-5.4-mini",
  236. model_settings=ModelSettings(tool_choice="file_search"),
  237. files_folder=str(temp_dir),
  238. include_search_results=True,
  239. )
  240. # Create an agency with the agent
  241. agency = Agency(agent)
  242. # Wait for file processing and vector store indexing (active polling for stability)
  243. client = AsyncOpenAI(api_key=OPENAI_API_KEY)
  244. vs_id = getattr(agent, "_associated_vector_store_id", None)
  245. if vs_id:
  246. for _ in range(60): # up to 60 seconds
  247. vs = await client.vector_stores.retrieve(vs_id)
  248. if getattr(vs, "status", "") == "completed":
  249. break
  250. if getattr(vs, "status", "") == "failed":
  251. raise RuntimeError(f"Vector store processing failed: {vs}")
  252. await asyncio.sleep(1)
  253. else:
  254. # fallback to a short delay if no id is exposed
  255. await asyncio.sleep(5)
  256. # TURN 1: Agent searches but gives summary only
  257. logger.info("=== TURN 1: Agent searches with FileSearch ===")
  258. from agents import RunConfig
  259. result1 = await agency.get_response(
  260. message=(
  261. "Use FileSearch to search the company data for financial information and employee data. "
  262. "Just confirm you found it, don't give me the specific numbers yet."
  263. ),
  264. run_config=RunConfig(model_settings=ModelSettings(tool_choice="file_search")),
  265. )
  266. assert result1 is not None
  267. logger.info(f"Turn 1 result: {result1.final_output}")
  268. # Get the conversation history from the agency's thread manager
  269. history_after_turn1 = agency.thread_manager._store.messages
  270. logger.info(f"=== CONVERSATION HISTORY AFTER TURN 1 ({len(history_after_turn1)} items) ===")
  271. hosted_tool_outputs_found = 0
  272. preservation_items = []
  273. for i, item in enumerate(history_after_turn1):
  274. item_type = item.get("type", f"role={item.get('role')}")
  275. logger.info(f"Item {i + 1}: {item_type}")
  276. # Look for hosted tool search results messages
  277. if item.get("role") == "system" and "[SEARCH_RESULTS]" in str(item.get("content", "")):
  278. hosted_tool_outputs_found += 1
  279. preservation_items.append(item)
  280. logger.info(f" Found search results message: {str(item.get('content', ''))}...")
  281. logger.info(f"Found {hosted_tool_outputs_found} hosted tool preservation items")
  282. # TURN 2: Ask for exact tool output
  283. logger.info("=== TURN 2: Requesting exact tool output ===")
  284. logger.info(f"History at turn 2: {agency.thread_manager._store.messages}")
  285. result2 = await agency.get_response(
  286. message=(
  287. "Now provide me the exact file search results that you found in the previous tool call. "
  288. "Do not use the tool again. I'm looking for Q3 and Q4 revenue, operating costs, "
  289. "and total employee count."
  290. )
  291. )
  292. assert result2 is not None
  293. logger.info(f"Turn 2 result: {result2.final_output}")
  294. # Verify agent can access specific data from previous tool call
  295. response_text = str(result2.final_output)
  296. # Look for specific numbers that should only come from file search results
  297. has_q4_revenue = "7,892,345.67" in response_text or "7892345.67" in response_text
  298. has_q3_revenue = "6,234,567.89" in response_text or "6234567.89" in response_text
  299. has_operating_costs = "2,345,678.90" in response_text or "2345678.90" in response_text
  300. has_employees = "1,234" in response_text or "1234" in response_text
  301. logger.info(f"Agent can access Q4 revenue (7,892,345.67): {has_q4_revenue}")
  302. logger.info(f"Agent can access Q3 revenue (6,234,567.89): {has_q3_revenue}")
  303. logger.info(f"Agent can access operating costs (2,345,678.90): {has_operating_costs}")
  304. logger.info(f"Agent can access employee count (1,234): {has_employees}")
  305. # TEST ASSERTIONS
  306. # 1. Verify that hosted tool outputs are preserved in conversation history
  307. assert hosted_tool_outputs_found > 0, (
  308. "No hosted tool output preservation found in conversation history. "
  309. "Hosted tool results should be preserved for multi-turn access."
  310. )
  311. # 2. Verify that agent can access specific data from previous hosted tool calls
  312. data_access_score = sum([has_q4_revenue, has_q3_revenue, has_operating_costs, has_employees])
  313. assert data_access_score >= 2, (
  314. f"Agent cannot access specific data from previous hosted tool calls. "
  315. f"Only found {data_access_score}/4 specific data points in response: {response_text}"
  316. )
  317. logger.info("✅ Hosted tool output preservation test completed successfully")