# batch_eval.py (3.6 KB)
import argparse
import json
import re

import jsonlines
from openai import OpenAI
  5. def batch_eval(query_file, result1_file, result2_file, output_file_path):
  6. client = OpenAI()
  7. with open(query_file, "r") as f:
  8. data = f.read()
  9. queries = re.findall(r"- Question \d+: (.+)", data)
  10. with open(result1_file, "r") as f:
  11. answers1 = json.load(f)
  12. answers1 = [i["result"] for i in answers1]
  13. with open(result2_file, "r") as f:
  14. answers2 = json.load(f)
  15. answers2 = [i["result"] for i in answers2]
  16. requests = []
  17. for i, (query, answer1, answer2) in enumerate(zip(queries, answers1, answers2)):
  18. sys_prompt = """
  19. ---Role---
  20. You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
  21. """
  22. prompt = f"""
  23. You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
  24. - **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
  25. - **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
  26. - **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?
  27. For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories.
  28. Here is the question:
  29. {query}
  30. Here are the two answers:
  31. **Answer 1:**
  32. {answer1}
  33. **Answer 2:**
  34. {answer2}
  35. Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.
  36. Output your evaluation in the following JSON format:
  37. {{
  38. "Comprehensiveness": {{
  39. "Winner": "[Answer 1 or Answer 2]",
  40. "Explanation": "[Provide explanation here]"
  41. }},
  42. "Diversity": {{
  43. "Winner": "[Answer 1 or Answer 2]",
  44. "Explanation": "[Provide explanation here]"
  45. }},
  46. "Empowerment": {{
  47. "Winner": "[Answer 1 or Answer 2]",
  48. "Explanation": "[Provide explanation here]"
  49. }},
  50. "Overall Winner": {{
  51. "Winner": "[Answer 1 or Answer 2]",
  52. "Explanation": "[Summarize why this answer is the overall winner based on the three criteria]"
  53. }}
  54. }}
  55. """
  56. request_data = {
  57. "custom_id": f"request-{i + 1}",
  58. "method": "POST",
  59. "url": "/v1/chat/completions",
  60. "body": {
  61. "model": "gpt-4o-mini",
  62. "messages": [
  63. {"role": "system", "content": sys_prompt},
  64. {"role": "user", "content": prompt},
  65. ],
  66. },
  67. }
  68. requests.append(request_data)
  69. with jsonlines.open(output_file_path, mode="w") as writer:
  70. for request in requests:
  71. writer.write(request)
  72. print(f"Batch API requests written to {output_file_path}")
  73. batch_input_file = client.files.create(
  74. file=open(output_file_path, "rb"), purpose="batch"
  75. )
  76. batch_input_file_id = batch_input_file.id
  77. batch = client.batches.create(
  78. input_file_id=batch_input_file_id,
  79. endpoint="/v1/chat/completions",
  80. completion_window="24h",
  81. metadata={"description": "nightly eval job"},
  82. )
  83. print(f"Batch {batch.id} has been created.")
  84. if __name__ == "__main__":
  85. batch_eval()