# Configuration for tokenizer training and dataset encoding.
# Values of the form ${...} are interpolations resolved when the config is
# loaded; ${hydra:runtime.cwd} is provided by Hydra's `hydra` resolver.

# Uncomment to save Hydra outputs to a custom directory:
# hydra:
#   run:
#     dir: ./my_output_dir

# --- dataset selection -------------------------------------------------------
root_dir: ${hydra:runtime.cwd}  # root directory the code is run from
dataset_name: owt  # `TinyStories` or `owt`, change as needed
dataset_split: train  # `train` or `valid`, change as needed

# --- data locations ----------------------------------------------------------
data_dir: ${root_dir}/data/${dataset_name}
txt_dir: ${data_dir}/txt
dat_dir: ${data_dir}/dat

train_txt_path: ${txt_dir}/train.txt
valid_txt_path: ${txt_dir}/valid.txt
input_path: ${txt_dir}/${dataset_split}.txt  # txt data for training tokenizer

train_dat_path: ${dat_dir}/train.dat
valid_dat_path: ${dat_dir}/valid.dat

# --- tokenizer artifacts -----------------------------------------------------
# Save path for the tokenizer. The trailing slash is kept as in the original
# value, so the two paths below contain `//` after interpolation.
tokenizer_dir: ${root_dir}/tokenizers/${dataset_name}_${dataset_split}/
vocab_path: ${tokenizer_dir}/vocab.pkl
merges_path: ${tokenizer_dir}/merges.pkl

# Vocabulary size per dataset. Written as plain integers (no `_` separators)
# so loaders that follow YAML 1.2 also read them as ints, not strings.
vocab_size_map:
  TinyStories: 10000
  owt: 32000

# Set vocab_size dynamically by looking up dataset_name in vocab_size_map.
vocab_size: ${vocab_size_map.${dataset_name}}

special_tokens: ["<|endoftext|>"]

# --- train tokenizer config --------------------------------------------------
num_chunks: 32
num_processes: 8

# --- encode config -----------------------------------------------------------
batch_size: 10000  # author's note: 4096 (presumably an alternative value)
n_workers: 8  # author's note: 8