Step_0.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. import os
  2. import json
  3. import glob
  4. import argparse
  5. def extract_unique_contexts(input_directory, output_directory):
  6. os.makedirs(output_directory, exist_ok=True)
  7. jsonl_files = glob.glob(os.path.join(input_directory, "*.jsonl"))
  8. print(f"Found {len(jsonl_files)} JSONL files.")
  9. for file_path in jsonl_files:
  10. filename = os.path.basename(file_path)
  11. name, ext = os.path.splitext(filename)
  12. output_filename = f"{name}_unique_contexts.json"
  13. output_path = os.path.join(output_directory, output_filename)
  14. unique_contexts_dict = {}
  15. print(f"Processing file: {filename}")
  16. try:
  17. with open(file_path, "r", encoding="utf-8") as infile:
  18. for line_number, line in enumerate(infile, start=1):
  19. line = line.strip()
  20. if not line:
  21. continue
  22. try:
  23. json_obj = json.loads(line)
  24. context = json_obj.get("context")
  25. if context and context not in unique_contexts_dict:
  26. unique_contexts_dict[context] = None
  27. except json.JSONDecodeError as e:
  28. print(
  29. f"JSON decoding error in file {filename} at line {line_number}: {e}"
  30. )
  31. except FileNotFoundError:
  32. print(f"File not found: {filename}")
  33. continue
  34. except Exception as e:
  35. print(f"An error occurred while processing file {filename}: {e}")
  36. continue
  37. unique_contexts_list = list(unique_contexts_dict.keys())
  38. print(
  39. f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}."
  40. )
  41. try:
  42. with open(output_path, "w", encoding="utf-8") as outfile:
  43. json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
  44. print(f"Unique `context` entries have been saved to: {output_filename}")
  45. except Exception as e:
  46. print(f"An error occurred while saving to the file {output_filename}: {e}")
  47. print("All files have been processed.")
  48. if __name__ == "__main__":
  49. parser = argparse.ArgumentParser()
  50. parser.add_argument("-i", "--input_dir", type=str, default="../datasets")
  51. parser.add_argument(
  52. "-o", "--output_dir", type=str, default="../datasets/unique_contexts"
  53. )
  54. args = parser.parse_args()
  55. extract_unique_contexts(args.input_dir, args.output_dir)