# Step_2.py: prompt an OpenAI chat model for dataset-level questions built from summaries of each class's unique contexts.
  1. import json
  2. from openai import OpenAI
  3. from transformers import GPT2Tokenizer
  4. def openai_complete_if_cache(
  5. model="gpt-4o", prompt=None, system_prompt=None, history_messages=[], **kwargs
  6. ) -> str:
  7. openai_client = OpenAI()
  8. messages = []
  9. if system_prompt:
  10. messages.append({"role": "system", "content": system_prompt})
  11. messages.extend(history_messages)
  12. messages.append({"role": "user", "content": prompt})
  13. response = openai_client.chat.completions.create(
  14. model=model, messages=messages, **kwargs
  15. )
  16. return response.choices[0].message.content
  17. tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
  18. def get_summary(context, tot_tokens=2000):
  19. tokens = tokenizer.tokenize(context)
  20. half_tokens = tot_tokens // 2
  21. start_tokens = tokens[1000 : 1000 + half_tokens]
  22. end_tokens = tokens[-(1000 + half_tokens) : 1000]
  23. summary_tokens = start_tokens + end_tokens
  24. summary = tokenizer.convert_tokens_to_string(summary_tokens)
  25. return summary
  26. clses = ["agriculture"]
  27. for cls in clses:
  28. with open(f"../datasets/unique_contexts/{cls}_unique_contexts.json", mode="r") as f:
  29. unique_contexts = json.load(f)
  30. summaries = [get_summary(context) for context in unique_contexts]
  31. total_description = "\n\n".join(summaries)
  32. prompt = f"""
  33. Given the following description of a dataset:
  34. {total_description}
  35. Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset.
  36. Output the results in the following structure:
  37. - User 1: [user description]
  38. - Task 1: [task description]
  39. - Question 1:
  40. - Question 2:
  41. - Question 3:
  42. - Question 4:
  43. - Question 5:
  44. - Task 2: [task description]
  45. ...
  46. - Task 5: [task description]
  47. - User 2: [user description]
  48. ...
  49. - User 5: [user description]
  50. ...
  51. """
  52. result = openai_complete_if_cache(model="gpt-4o", prompt=prompt)
  53. file_path = f"../datasets/questions/{cls}_questions.txt"
  54. with open(file_path, "w") as file:
  55. file.write(result)
  56. print(f"{cls}_questions written to {file_path}")