| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112 |
- import re
- import json
- import jsonlines
- from openai import OpenAI
def _build_eval_request(index, query, answer1, answer2, model):
    """Return one Batch API request dict comparing two answers to ``query``.

    ``index`` is the 0-based position of the question; the request's
    ``custom_id`` is ``request-<index + 1>``.
    """
    sys_prompt = """
---Role---
You are an expert tasked with evaluating two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
"""
    prompt = f"""
You will evaluate two answers to the same question based on three criteria: **Comprehensiveness**, **Diversity**, and **Empowerment**.
- **Comprehensiveness**: How much detail does the answer provide to cover all aspects and details of the question?
- **Diversity**: How varied and rich is the answer in providing different perspectives and insights on the question?
- **Empowerment**: How well does the answer help the reader understand and make informed judgments about the topic?
For each criterion, choose the better answer (either Answer 1 or Answer 2) and explain why. Then, select an overall winner based on these three categories.
Here is the question:
{query}
Here are the two answers:
**Answer 1:**
{answer1}
**Answer 2:**
{answer2}
Evaluate both answers using the three criteria listed above and provide detailed explanations for each criterion.
Output your evaluation in the following JSON format:
{{
"Comprehensiveness": {{
"Winner": "[Answer 1 or Answer 2]",
"Explanation": "[Provide explanation here]"
}},
"Diversity": {{
"Winner": "[Answer 1 or Answer 2]",
"Explanation": "[Provide explanation here]"
}},
"Empowerment": {{
"Winner": "[Answer 1 or Answer 2]",
"Explanation": "[Provide explanation here]"
}},
"Overall Winner": {{
"Winner": "[Answer 1 or Answer 2]",
"Explanation": "[Summarize why this answer is the overall winner based on the three criteria]"
}}
}}
"""
    return {
        "custom_id": f"request-{index + 1}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": prompt},
            ],
        },
    }


def batch_eval(
    query_file,
    result1_file,
    result2_file,
    output_file_path,
    model="gpt-4o-mini",
):
    """Build pairwise-comparison requests and submit them as an OpenAI batch.

    Questions are extracted from ``query_file`` with the pattern
    ``- Question <number>: <text>``. The two result files are JSON arrays of
    objects; the ``"result"`` field of each object is used as the answer.
    Questions and answers are paired by position, one request per triple.
    The requests are written to ``output_file_path`` as JSON Lines, the file
    is uploaded with purpose ``"batch"``, and a batch job with a 24h
    completion window is created. Progress is reported with ``print``.

    Args:
        query_file: Path to the text file containing the questions.
        result1_file: Path to the JSON file with the first set of answers.
        result2_file: Path to the JSON file with the second set of answers.
        output_file_path: Path where the JSONL batch input file is written.
        model: Chat model name placed in every request body.

    Returns:
        None.
    """
    client = OpenAI()

    # Read the inputs as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(query_file, "r", encoding="utf-8") as f:
        data = f.read()
    queries = re.findall(r"- Question \d+: (.+)", data)

    with open(result1_file, "r", encoding="utf-8") as f:
        answers1 = [item["result"] for item in json.load(f)]

    with open(result2_file, "r", encoding="utf-8") as f:
        answers2 = [item["result"] for item in json.load(f)]

    # zip() stops at the shortest input; make the truncation visible instead
    # of silently dropping questions or answers.
    if not (len(queries) == len(answers1) == len(answers2)):
        print(
            f"Warning: input lengths differ (queries={len(queries)}, "
            f"answers1={len(answers1)}, answers2={len(answers2)}); "
            "only the first "
            f"{min(len(queries), len(answers1), len(answers2))} are evaluated."
        )

    requests = [
        _build_eval_request(i, query, answer1, answer2, model)
        for i, (query, answer1, answer2) in enumerate(
            zip(queries, answers1, answers2)
        )
    ]

    with jsonlines.open(output_file_path, mode="w") as writer:
        for request in requests:
            writer.write(request)
    print(f"Batch API requests written to {output_file_path}")

    # Close the upload handle deterministically once the upload call returns.
    with open(output_file_path, "rb") as batch_file:
        batch_input_file = client.files.create(file=batch_file, purpose="batch")
    batch_input_file_id = batch_input_file.id

    batch = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": "nightly eval job"},
    )
    print(f"Batch {batch.id} has been created.")
if __name__ == "__main__":
    # batch_eval() has four required positional parameters, so they are taken
    # from the command line; calling it with no arguments raises TypeError.
    import argparse

    parser = argparse.ArgumentParser(
        description="Create an OpenAI batch job comparing two sets of answers."
    )
    parser.add_argument("query_file", help="text file containing the questions")
    parser.add_argument("result1_file", help="JSON file with the first answers")
    parser.add_argument("result2_file", help="JSON file with the second answers")
    parser.add_argument(
        "output_file_path", help="where to write the JSONL batch input file"
    )
    args = parser.parse_args()

    batch_eval(
        args.query_file,
        args.result1_file,
        args.result2_file,
        args.output_file_path,
    )
|