test_openclaw_response_history_sanitization.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789
  1. from __future__ import annotations
  2. import copy
  3. import json
  4. import time
  5. from collections.abc import AsyncIterator
  6. from typing import Any
  7. import pytest
  8. from agents import Tool
  9. from agents.agent_output import AgentOutputSchemaBase
  10. from agents.handoffs import Handoff
  11. from agents.items import ModelResponse, TResponseInputItem, TResponseStreamEvent
  12. from agents.model_settings import ModelSettings
  13. from agents.models.interface import Model, ModelTracing
  14. from agents.usage import Usage
  15. from openai import OpenAI
  16. from openai.types.responses import (
  17. Response,
  18. ResponseCompletedEvent,
  19. ResponseContentPartAddedEvent,
  20. ResponseContentPartDoneEvent,
  21. ResponseCreatedEvent,
  22. ResponseOutputItemAddedEvent,
  23. ResponseOutputItemDoneEvent,
  24. ResponseOutputMessage,
  25. ResponseOutputText,
  26. ResponseTextDeltaEvent,
  27. )
  28. from openai.types.responses.response_prompt_param import ResponsePromptParam
  29. from openai.types.responses.response_usage import InputTokensDetails, OutputTokensDetails
  30. from agency_swarm import Agency, Agent
  31. from agency_swarm.integrations.fastapi_utils.endpoint_handlers import (
  32. ActiveRunRegistry,
  33. generate_chat_name,
  34. make_response_endpoint,
  35. make_stream_endpoint,
  36. )
  37. from agency_swarm.integrations.fastapi_utils.request_models import BaseRequest
  38. from agency_swarm.messages.response_input_sanitizer import (
  39. REASONING_ENCRYPTED_CONTENT_INCLUDE,
  40. sanitize_store_false_responses_input,
  41. )
  42. class _TrackingResponsesModel(Model):
  43. def __init__(self, model: str = "test-openclaw-threading") -> None:
  44. self.model = model
  45. self.issued_response_ids: list[str] = []
  46. self.seen_previous_response_ids: list[str | None] = []
  47. self.seen_inputs: list[str | list[TResponseInputItem]] = []
  48. self.seen_model_settings: list[ModelSettings] = []
  49. async def get_response(
  50. self,
  51. system_instructions: str | None,
  52. input: str | list[TResponseInputItem],
  53. model_settings: ModelSettings,
  54. tools: list[Tool],
  55. output_schema: AgentOutputSchemaBase | None,
  56. handoffs: list[Handoff],
  57. tracing: ModelTracing,
  58. *,
  59. previous_response_id: str | None,
  60. conversation_id: str | None,
  61. prompt: ResponsePromptParam | None,
  62. ) -> ModelResponse:
  63. self.seen_inputs.append(copy.deepcopy(input) if isinstance(input, list) else input)
  64. self.seen_model_settings.append(copy.deepcopy(model_settings))
  65. self.seen_previous_response_ids.append(previous_response_id)
  66. response_id = self._issue_response_id()
  67. return _build_model_response(text="OK", response_id=response_id)
  68. def stream_response(
  69. self,
  70. system_instructions: str | None,
  71. input: str | list[TResponseInputItem],
  72. model_settings: ModelSettings,
  73. tools: list[Tool],
  74. output_schema: AgentOutputSchemaBase | None,
  75. handoffs: list[Handoff],
  76. tracing: ModelTracing,
  77. *,
  78. previous_response_id: str | None,
  79. conversation_id: str | None,
  80. prompt: ResponsePromptParam | None,
  81. ) -> AsyncIterator[TResponseStreamEvent]:
  82. self.seen_inputs.append(copy.deepcopy(input) if isinstance(input, list) else input)
  83. self.seen_model_settings.append(copy.deepcopy(model_settings))
  84. self.seen_previous_response_ids.append(previous_response_id)
  85. response_id = self._issue_response_id()
  86. return _stream_text_events(text="OK", model_name=self.model, response_id=response_id)
  87. def _issue_response_id(self) -> str:
  88. response_id = f"resp_test_{len(self.issued_response_ids) + 1}"
  89. self.issued_response_ids.append(response_id)
  90. return response_id
  91. def _build_model_response(*, text: str, response_id: str) -> ModelResponse:
  92. message = ResponseOutputMessage(
  93. id=f"msg_{response_id}",
  94. content=[ResponseOutputText(text=text, type="output_text", annotations=[], logprobs=[])],
  95. role="assistant",
  96. status="completed",
  97. type="message",
  98. )
  99. usage = Usage(
  100. requests=1,
  101. input_tokens=0,
  102. output_tokens=1,
  103. total_tokens=1,
  104. input_tokens_details=InputTokensDetails(cached_tokens=0),
  105. output_tokens_details=OutputTokensDetails(reasoning_tokens=0),
  106. )
  107. return ModelResponse(output=[message], usage=usage, response_id=response_id)
  108. async def _stream_text_events(*, text: str, model_name: str, response_id: str) -> AsyncIterator[TResponseStreamEvent]:
  109. created_at = int(time.time())
  110. message_id = f"msg_{response_id}"
  111. completed_message = ResponseOutputMessage(
  112. id=message_id,
  113. content=[ResponseOutputText(text=text, type="output_text", annotations=[], logprobs=[])],
  114. role="assistant",
  115. status="completed",
  116. type="message",
  117. )
  118. yield ResponseCreatedEvent(
  119. response=Response(
  120. id=response_id,
  121. created_at=created_at,
  122. model=model_name,
  123. object="response",
  124. output=[],
  125. tool_choice="none",
  126. tools=[],
  127. parallel_tool_calls=False,
  128. usage=None,
  129. ),
  130. sequence_number=0,
  131. type="response.created",
  132. )
  133. yield ResponseOutputItemAddedEvent(
  134. item=ResponseOutputMessage(
  135. id=message_id,
  136. content=[],
  137. role="assistant",
  138. status="in_progress",
  139. type="message",
  140. ),
  141. output_index=0,
  142. sequence_number=1,
  143. type="response.output_item.added",
  144. )
  145. yield ResponseContentPartAddedEvent(
  146. content_index=0,
  147. item_id=message_id,
  148. output_index=0,
  149. part=ResponseOutputText(text="", type="output_text", annotations=[], logprobs=[]),
  150. sequence_number=2,
  151. type="response.content_part.added",
  152. )
  153. yield ResponseTextDeltaEvent(
  154. content_index=0,
  155. delta=text,
  156. item_id=message_id,
  157. logprobs=[],
  158. output_index=0,
  159. sequence_number=3,
  160. type="response.output_text.delta",
  161. )
  162. yield ResponseContentPartDoneEvent(
  163. content_index=0,
  164. item_id=message_id,
  165. output_index=0,
  166. part=ResponseOutputText(text=text, type="output_text", annotations=[], logprobs=[]),
  167. sequence_number=4,
  168. type="response.content_part.done",
  169. )
  170. yield ResponseOutputItemDoneEvent(
  171. item=completed_message,
  172. output_index=0,
  173. sequence_number=5,
  174. type="response.output_item.done",
  175. )
  176. yield ResponseCompletedEvent(
  177. response=Response(
  178. id=response_id,
  179. created_at=created_at,
  180. model=model_name,
  181. object="response",
  182. output=[completed_message],
  183. tool_choice="none",
  184. tools=[],
  185. parallel_tool_calls=False,
  186. usage=None,
  187. ),
  188. sequence_number=6,
  189. type="response.completed",
  190. )
  191. def _parse_sse_messages_payload(chunks: list[str]) -> dict[str, Any]:
  192. current_event: str | None = None
  193. for chunk in chunks:
  194. for line in chunk.splitlines():
  195. if line.startswith("event: "):
  196. current_event = line.split("event: ", 1)[1].strip()
  197. continue
  198. if current_event == "messages" and line.startswith("data: "):
  199. return json.loads(line.split("data: ", 1)[1])
  200. raise AssertionError("messages payload not found in SSE stream")
  201. class _StubRequest:
  202. async def is_disconnected(self) -> bool:
  203. return False
  204. def _persist_messages(store: list[dict[str, Any]], messages: list[dict[str, Any]]) -> None:
  205. store[:] = copy.deepcopy(messages)
  206. def _agency_factory_with_store(model: _TrackingResponsesModel, store: list[dict[str, Any]]) -> Agency:
  207. agent = Agent(
  208. name="TestAgent",
  209. instructions="Base instructions",
  210. model=model,
  211. model_settings=ModelSettings(temperature=0.0),
  212. )
  213. return Agency(
  214. agent,
  215. load_threads_callback=lambda: copy.deepcopy(store),
  216. save_threads_callback=lambda messages: _persist_messages(store, messages),
  217. )
  218. def _build_agency_factory(model: _TrackingResponsesModel):
  219. def create_agency(load_threads_callback=None, save_threads_callback=None):
  220. agent = Agent(
  221. name="TestAgent",
  222. instructions="Base instructions",
  223. model=model,
  224. model_settings=ModelSettings(temperature=0.0),
  225. )
  226. return Agency(
  227. agent,
  228. load_threads_callback=load_threads_callback,
  229. save_threads_callback=save_threads_callback,
  230. )
  231. return create_agency
  232. def _build_store_false_agency_factory(model: _TrackingResponsesModel):
  233. def create_agency(load_threads_callback=None, save_threads_callback=None):
  234. agent = Agent(
  235. name="TestAgent",
  236. instructions="Base instructions",
  237. model=model,
  238. model_settings=ModelSettings(store=False, temperature=0.0),
  239. )
  240. return Agency(
  241. agent,
  242. load_threads_callback=load_threads_callback,
  243. save_threads_callback=save_threads_callback,
  244. )
  245. return create_agency
  246. def _history_with_encrypted_reasoning() -> list[dict[str, Any]]:
  247. return [
  248. {
  249. "type": "reasoning",
  250. "id": "rs_reasoning_123",
  251. "summary": [{"type": "summary_text", "text": "looked up the answer", "id": "rs_summary_123"}],
  252. "content": [{"type": "reasoning_text", "text": "private", "id": "rs_content_123"}],
  253. "encrypted_content": "encrypted_reasoning",
  254. "previous_response_id": "resp_previous_123",
  255. "status": "completed",
  256. "agent": "TestAgent",
  257. "callerAgent": None,
  258. "timestamp": 1,
  259. },
  260. {
  261. "type": "message",
  262. "role": "assistant",
  263. "id": "msg_answer_123",
  264. "content": [{"type": "output_text", "text": "The answer is 42.", "annotations": [], "id": "msg_text_123"}],
  265. "conversation_id": "conv_previous_123",
  266. "status": "completed",
  267. "agent": "TestAgent",
  268. "callerAgent": None,
  269. "timestamp": 2,
  270. },
  271. {
  272. "type": "function_call",
  273. "id": "fc_lookup_123",
  274. "call_id": "call_lookup_123",
  275. "name": "lookup",
  276. "arguments": "{}",
  277. "status": "completed",
  278. "agent": "TestAgent",
  279. "callerAgent": None,
  280. "timestamp": 3,
  281. },
  282. {
  283. "type": "function_call_output",
  284. "id": "fc_output_123",
  285. "call_id": "call_lookup_123",
  286. "output": "42",
  287. "status": "completed",
  288. "agent": "TestAgent",
  289. "callerAgent": None,
  290. "timestamp": 4,
  291. },
  292. {"type": "item_reference", "id": "msg_answer_123", "agent": "TestAgent", "callerAgent": None, "timestamp": 5},
  293. ]
  294. def _history_with_unencrypted_reasoning() -> list[dict[str, Any]]:
  295. history = _history_with_encrypted_reasoning()
  296. reasoning = next(item for item in history if item.get("type") == "reasoning")
  297. reasoning.pop("encrypted_content")
  298. return history
  299. def _history_with_unencrypted_reasoning_before_tool_pair() -> list[dict[str, Any]]:
  300. return [
  301. {
  302. "type": "reasoning",
  303. "id": "rs_reasoning_123",
  304. "summary": [{"type": "summary_text", "text": "looked up the answer"}],
  305. "status": "completed",
  306. },
  307. {
  308. "type": "function_call",
  309. "id": "fc_lookup_123",
  310. "call_id": "call_lookup_123",
  311. "name": "lookup",
  312. "arguments": "{}",
  313. "status": "completed",
  314. },
  315. {
  316. "type": "function_call_output",
  317. "id": "fc_output_123",
  318. "call_id": "call_lookup_123",
  319. "output": "42",
  320. "status": "completed",
  321. },
  322. {"role": "user", "content": "again"},
  323. ]
  324. def _history_with_unencrypted_reasoning_before_current_user_message() -> list[dict[str, Any]]:
  325. return [
  326. {
  327. "type": "reasoning",
  328. "id": "rs_reasoning_123",
  329. "summary": [{"type": "summary_text", "text": "looked up the answer"}],
  330. "status": "completed",
  331. },
  332. {"role": "user", "content": "again"},
  333. ]
  334. def _history_with_unencrypted_reasoning_before_builtin_tool_call() -> list[dict[str, Any]]:
  335. return [
  336. {
  337. "type": "reasoning",
  338. "id": "rs_reasoning_123",
  339. "summary": [{"type": "summary_text", "text": "searched the web"}],
  340. "status": "completed",
  341. },
  342. {
  343. "type": "web_search_call",
  344. "id": "ws_lookup_123",
  345. "status": "completed",
  346. },
  347. {"role": "user", "content": "again"},
  348. ]
  349. def _history_with_unencrypted_reasoning_before_tool_search_pair() -> list[dict[str, Any]]:
  350. return [
  351. {
  352. "type": "reasoning",
  353. "id": "rs_reasoning_123",
  354. "summary": [{"type": "summary_text", "text": "searched local tools"}],
  355. "status": "completed",
  356. },
  357. {
  358. "type": "tool_search_call",
  359. "id": "ts_lookup_123",
  360. "call_id": "call_lookup_123",
  361. "arguments": {},
  362. "execution": "client",
  363. },
  364. {
  365. "type": "tool_search_output",
  366. "id": "ts_output_123",
  367. "call_id": "call_lookup_123",
  368. "tools": [],
  369. "execution": "client",
  370. },
  371. {"role": "user", "content": "again"},
  372. ]
  373. def _history_with_user_and_legacy_unencrypted_reasoning_turn() -> list[dict[str, Any]]:
  374. return [
  375. {"role": "user", "content": "what is 2+2?"},
  376. {
  377. "type": "reasoning",
  378. "id": "rs_reasoning_123",
  379. "summary": [{"type": "summary_text", "text": "calculated"}],
  380. "status": "completed",
  381. },
  382. {
  383. "type": "message",
  384. "role": "assistant",
  385. "id": "msg_answer_123",
  386. "content": [{"type": "output_text", "text": "4", "annotations": []}],
  387. "status": "completed",
  388. },
  389. {"role": "user", "content": "thanks"},
  390. ]
  391. def _assert_store_false_input_preserves_stateless_reasoning(model_input: str | list[TResponseInputItem]) -> None:
  392. assert isinstance(model_input, list)
  393. reasoning = next(item for item in model_input if isinstance(item, dict) and item.get("type") == "reasoning")
  394. assert reasoning["id"] == "rs_reasoning_123"
  395. assert reasoning["encrypted_content"] == "encrypted_reasoning"
  396. assert "previous_response_id" not in reasoning
  397. assert all("id" not in item for item in reasoning["summary"])
  398. assert all("id" not in item for item in reasoning["content"])
  399. assistant_message = next(item for item in model_input if isinstance(item, dict) and item.get("type") == "message")
  400. assert "conversation_id" not in assistant_message
  401. assert all("id" not in item for item in assistant_message["content"])
  402. function_call = next(item for item in model_input if isinstance(item, dict) and item.get("type") == "function_call")
  403. tool_output = next(
  404. item for item in model_input if isinstance(item, dict) and item.get("type") == "function_call_output"
  405. )
  406. assert function_call["call_id"] == "call_lookup_123"
  407. assert tool_output["call_id"] == "call_lookup_123"
  408. def _assert_unencrypted_reasoning_is_dropped(model_input: str | list[TResponseInputItem]) -> None:
  409. assert isinstance(model_input, list)
  410. assert all(not (isinstance(item, dict) and item.get("type") == "reasoning") for item in model_input)
  411. assert all(not (isinstance(item, dict) and item.get("id") == "msg_answer_123") for item in model_input)
  412. assert model_input == [{"role": "user", "content": "again", "type": "message"}]
  413. def _assert_store_false_requests_encrypted_reasoning(model_settings: ModelSettings) -> None:
  414. assert model_settings.store is False
  415. assert model_settings.response_include is not None
  416. assert REASONING_ENCRYPTED_CONTENT_INCLUDE in model_settings.response_include
  417. def _assert_history_input_has_no_response_ids(model_input: str | list[TResponseInputItem]) -> None:
  418. assert isinstance(model_input, list)
  419. leaked_response_ids = [item for item in model_input if isinstance(item, dict) and "response_id" in item]
  420. assert leaked_response_ids == []
  421. def _assert_messages_have_no_response_ids(messages: list[dict[str, Any]]) -> None:
  422. leaked_response_ids = [item for item in messages if "response_id" in item]
  423. assert leaked_response_ids == []
  424. @pytest.mark.asyncio
  425. async def test_response_endpoint_replays_returned_history_without_hidden_response_ids() -> None:
  426. model = _TrackingResponsesModel()
  427. handler = make_response_endpoint(BaseRequest, _build_agency_factory(model), lambda: None)
  428. first = await handler(BaseRequest(message="hi"), token=None)
  429. history = copy.deepcopy(first["new_messages"])
  430. _assert_messages_have_no_response_ids(history)
  431. await handler(BaseRequest(message="again", chat_history=history), token=None)
  432. assert model.seen_previous_response_ids == [None, None]
  433. _assert_history_input_has_no_response_ids(model.seen_inputs[1])
  434. @pytest.mark.asyncio
  435. async def test_stream_endpoint_replays_returned_history_without_hidden_response_ids() -> None:
  436. model = _TrackingResponsesModel()
  437. handler = make_stream_endpoint(BaseRequest, _build_agency_factory(model), lambda: None, ActiveRunRegistry())
  438. http_request = _StubRequest()
  439. first_response = await handler(http_request=http_request, request=BaseRequest(message="hi"), token=None)
  440. first_chunks = [chunk async for chunk in first_response.body_iterator]
  441. first_payload = _parse_sse_messages_payload(first_chunks)
  442. history = copy.deepcopy(first_payload["new_messages"])
  443. _assert_messages_have_no_response_ids(history)
  444. second_response = await handler(
  445. http_request=http_request,
  446. request=BaseRequest(message="again", chat_history=history),
  447. token=None,
  448. )
  449. _second_chunks = [chunk async for chunk in second_response.body_iterator]
  450. assert model.seen_previous_response_ids == [None, None]
  451. _assert_history_input_has_no_response_ids(model.seen_inputs[1])
  452. @pytest.mark.asyncio
  453. async def test_response_endpoint_store_false_requests_and_preserves_encrypted_reasoning() -> None:
  454. model = _TrackingResponsesModel()
  455. handler = make_response_endpoint(BaseRequest, _build_store_false_agency_factory(model), lambda: None)
  456. await handler(BaseRequest(message="again", chat_history=_history_with_encrypted_reasoning()), token=None)
  457. _assert_store_false_requests_encrypted_reasoning(model.seen_model_settings[0])
  458. _assert_store_false_input_preserves_stateless_reasoning(model.seen_inputs[0])
  459. @pytest.mark.asyncio
  460. async def test_stream_endpoint_store_false_drops_only_unencrypted_reasoning() -> None:
  461. model = _TrackingResponsesModel()
  462. handler = make_stream_endpoint(
  463. BaseRequest,
  464. _build_store_false_agency_factory(model),
  465. lambda: None,
  466. ActiveRunRegistry(),
  467. )
  468. response = await handler(
  469. http_request=_StubRequest(),
  470. request=BaseRequest(message="again", chat_history=_history_with_unencrypted_reasoning()),
  471. token=None,
  472. )
  473. _chunks = [chunk async for chunk in response.body_iterator]
  474. _assert_store_false_requests_encrypted_reasoning(model.seen_model_settings[0])
  475. _assert_unencrypted_reasoning_is_dropped(model.seen_inputs[0])
  476. @pytest.mark.asyncio
  477. async def test_stream_endpoint_store_false_drops_legacy_reasoning_span_and_keeps_current_user() -> None:
  478. model = _TrackingResponsesModel()
  479. handler = make_stream_endpoint(
  480. BaseRequest,
  481. _build_store_false_agency_factory(model),
  482. lambda: None,
  483. ActiveRunRegistry(),
  484. )
  485. legacy_history = _history_with_unencrypted_reasoning_before_tool_pair()[:-1]
  486. response = await handler(
  487. http_request=_StubRequest(),
  488. request=BaseRequest(message="again", chat_history=legacy_history),
  489. token=None,
  490. )
  491. _chunks = [chunk async for chunk in response.body_iterator]
  492. _assert_store_false_requests_encrypted_reasoning(model.seen_model_settings[0])
  493. assert model.seen_inputs[0] == [{"role": "user", "content": "again", "type": "message"}]
  494. def test_store_false_sanitizer_drops_dependent_followers_after_unencrypted_reasoning() -> None:
  495. sanitized = sanitize_store_false_responses_input(_history_with_unencrypted_reasoning_before_tool_pair())
  496. assert sanitized == [{"role": "user", "content": "again"}]
  497. def test_store_false_sanitizer_preserves_current_user_after_unencrypted_reasoning() -> None:
  498. sanitized = sanitize_store_false_responses_input(_history_with_unencrypted_reasoning_before_current_user_message())
  499. assert sanitized == [{"role": "user", "content": "again"}]
  500. def test_store_false_sanitizer_drops_builtin_tool_follower_after_unencrypted_reasoning() -> None:
  501. sanitized = sanitize_store_false_responses_input(_history_with_unencrypted_reasoning_before_builtin_tool_call())
  502. assert sanitized == [{"role": "user", "content": "again"}]
  503. def test_store_false_sanitizer_drops_tool_search_pair_after_unencrypted_reasoning() -> None:
  504. sanitized = sanitize_store_false_responses_input(_history_with_unencrypted_reasoning_before_tool_search_pair())
  505. assert sanitized == [{"role": "user", "content": "again"}]
  506. def test_store_false_sanitizer_drops_full_legacy_reasoning_turn() -> None:
  507. sanitized = sanitize_store_false_responses_input(_history_with_user_and_legacy_unencrypted_reasoning_turn())
  508. assert sanitized == [{"role": "user", "content": "thanks"}]
  509. def test_store_false_sanitizer_drops_late_reference_to_removed_reasoning() -> None:
  510. sanitized = sanitize_store_false_responses_input(
  511. [
  512. {
  513. "type": "reasoning",
  514. "id": "rs_reasoning_123",
  515. "summary": [{"type": "summary_text", "text": "legacy"}],
  516. "status": "completed",
  517. },
  518. {"role": "user", "content": "again"},
  519. {"type": "item_reference", "id": "rs_reasoning_123"},
  520. ]
  521. )
  522. assert sanitized == [{"role": "user", "content": "again"}]
  523. def test_store_false_sanitizer_skips_non_messages_and_nested_unencrypted_reasoning() -> None:
  524. sanitized = sanitize_store_false_responses_input(
  525. [
  526. {
  527. "type": "reasoning",
  528. "id": "rs_reasoning_123",
  529. "summary": [{"type": "summary_text", "text": "legacy"}],
  530. "status": "completed",
  531. },
  532. "ignored legacy output",
  533. {
  534. "role": "user",
  535. "content": [
  536. {"type": "reasoning", "summary": [{"type": "summary_text", "text": "nested"}]},
  537. {"type": "input_text", "text": "again"},
  538. ],
  539. },
  540. ]
  541. )
  542. assert sanitized == [{"role": "user", "content": [{"type": "input_text", "text": "again"}]}]
  543. def test_store_false_sanitizer_drops_prior_provider_outputs_before_legacy_reasoning() -> None:
  544. sanitized = sanitize_store_false_responses_input(
  545. [
  546. {
  547. "type": "message",
  548. "role": "assistant",
  549. "content": [{"type": "output_text", "text": "stale"}],
  550. },
  551. {
  552. "type": "reasoning",
  553. "id": "rs_reasoning_123",
  554. "summary": [{"type": "summary_text", "text": "legacy"}],
  555. "status": "completed",
  556. },
  557. {"role": "user", "content": "again"},
  558. ]
  559. )
  560. assert sanitized == [{"role": "user", "content": "again"}]
  561. def test_store_false_sanitizer_keeps_prior_encrypted_reasoning_boundary() -> None:
  562. encrypted_reasoning = _history_with_encrypted_reasoning()[0]
  563. sanitized = sanitize_store_false_responses_input(
  564. [
  565. encrypted_reasoning,
  566. {
  567. "type": "reasoning",
  568. "id": "rs_legacy_456",
  569. "summary": [{"type": "summary_text", "text": "legacy"}],
  570. "status": "completed",
  571. },
  572. {"role": "user", "content": "again"},
  573. ]
  574. )
  575. assert sanitized == [
  576. {
  577. "type": "reasoning",
  578. "id": "rs_reasoning_123",
  579. "summary": [{"type": "summary_text", "text": "looked up the answer"}],
  580. "content": [{"type": "reasoning_text", "text": "private"}],
  581. "encrypted_content": "encrypted_reasoning",
  582. "status": "completed",
  583. "agent": "TestAgent",
  584. "timestamp": 1,
  585. },
  586. {"role": "user", "content": "again"},
  587. ]
  588. def test_live_openai_store_false_replays_encrypted_reasoning() -> None:
  589. """Live OpenAI proof for stateless Responses reasoning replay."""
  590. client = OpenAI()
  591. first = client.responses.create(
  592. model="gpt-5.4-nano",
  593. input="Compute 37*41. Return only the number.",
  594. store=False,
  595. include=[REASONING_ENCRYPTED_CONTENT_INCLUDE],
  596. reasoning={"effort": "high"},
  597. max_output_tokens=64,
  598. )
  599. first_items = [item.model_dump(exclude_none=True) for item in first.output]
  600. reasoning_items = [item for item in first_items if item.get("type") == "reasoning"]
  601. output_types = [item.get("type") for item in first_items]
  602. reasoning_tokens = first.usage.output_tokens_details.reasoning_tokens if first.usage else None
  603. assert first.output_text.strip() == "1517"
  604. assert reasoning_items, f"Expected encrypted reasoning output item; got {output_types=} {reasoning_tokens=}"
  605. assert all(item.get("encrypted_content") for item in reasoning_items)
  606. replay_input = sanitize_store_false_responses_input(
  607. [
  608. *first_items,
  609. {
  610. "role": "user",
  611. "content": "What exact number did you just return? Return only that same number.",
  612. },
  613. ]
  614. )
  615. second = client.responses.create(
  616. model="gpt-5.4-nano",
  617. input=replay_input,
  618. store=False,
  619. include=[REASONING_ENCRYPTED_CONTENT_INCLUDE],
  620. reasoning={"effort": "high"},
  621. max_output_tokens=64,
  622. )
  623. assert second.output_text.strip() == "1517"
  624. @pytest.mark.asyncio
  625. async def test_codex_chat_name_store_false_uses_encrypted_reasoning_include() -> None:
  626. captured_inputs: list[list[TResponseInputItem]] = []
  627. captured_includes: list[list[str]] = []
  628. class _TitleStream:
  629. def __aiter__(self):
  630. return self
  631. async def __anext__(self):
  632. raise StopAsyncIteration
  633. class _Responses:
  634. async def create(self, **kwargs: Any) -> _TitleStream:
  635. captured_inputs.append(copy.deepcopy(kwargs["input"]))
  636. captured_includes.append(copy.deepcopy(kwargs["include"]))
  637. return _TitleStream()
  638. class _Client:
  639. base_url = "https://chatgpt.com/backend-api/codex"
  640. responses = _Responses()
  641. with pytest.raises(ValueError, match="Generated chat name"):
  642. await generate_chat_name(_history_with_encrypted_reasoning(), openai_client=_Client()) # type: ignore[arg-type]
  643. assert captured_inputs
  644. assert captured_includes
  645. assert all(include == [REASONING_ENCRYPTED_CONTENT_INCLUDE] for include in captured_includes)
  646. _assert_store_false_input_preserves_stateless_reasoning(captured_inputs[0])
  647. @pytest.mark.asyncio
  648. async def test_agency_get_response_persists_history_without_hidden_response_ids() -> None:
  649. model = _TrackingResponsesModel()
  650. persisted_history: list[dict[str, Any]] = []
  651. agency = _agency_factory_with_store(model, persisted_history)
  652. await agency.get_response(message="hi")
  653. _assert_messages_have_no_response_ids(persisted_history)
  654. await agency.get_response(message="again")
  655. assert model.seen_previous_response_ids == [None, None]
  656. _assert_history_input_has_no_response_ids(model.seen_inputs[1])
  657. @pytest.mark.asyncio
  658. async def test_agency_stream_persists_history_without_hidden_response_ids() -> None:
  659. model = _TrackingResponsesModel()
  660. persisted_history: list[dict[str, Any]] = []
  661. agency = _agency_factory_with_store(model, persisted_history)
  662. first_stream = agency.get_response_stream(message="hi")
  663. _first_events = [event async for event in first_stream]
  664. _assert_messages_have_no_response_ids(persisted_history)
  665. second_stream = agency.get_response_stream(message="again")
  666. _second_events = [event async for event in second_stream]
  667. assert model.seen_previous_response_ids == [None, None]
  668. _assert_history_input_has_no_response_ids(model.seen_inputs[1])