# Hydra / OmegaConf configuration for tokenizer training and dataset encoding.
# Values written as ${...} are interpolations resolved when the config is read.

# Uncomment to save Hydra run outputs to a custom directory:
# hydra:
#   run:
#     dir: ./my_output_dir

# Root directory the code is run from.
root_dir: ${hydra:runtime.cwd}

# Dataset selection.
dataset_name: owt  # `TinyStories` or `owt`, change as needed
dataset_split: train  # `train` or `valid`, change as needed

# Dataset locations, all derived from root_dir and dataset_name.
data_dir: ${root_dir}/data/${dataset_name}
txt_dir: ${data_dir}/txt
dat_dir: ${data_dir}/dat
train_txt_path: ${txt_dir}/train.txt
valid_txt_path: ${txt_dir}/valid.txt
input_path: ${txt_dir}/${dataset_split}.txt  # txt data for training tokenizer
train_dat_path: ${dat_dir}/train.dat
valid_dat_path: ${dat_dir}/valid.dat

# Save path for the tokenizer. No trailing slash, matching the other *_dir
# keys, so the joined paths below do not contain a doubled "//".
tokenizer_dir: ${root_dir}/tokenizers/${dataset_name}_${dataset_split}
vocab_path: ${tokenizer_dir}/vocab.pkl
merges_path: ${tokenizer_dir}/merges.pkl

# Vocabulary size per dataset; keys must match the allowed dataset_name values.
vocab_size_map:
  TinyStories: 10000
  owt: 32000

# Set vocab_size dynamically based on dataset_name (nested interpolation).
vocab_size: ${vocab_size_map.${dataset_name}}

special_tokens: ["<|endoftext|>"]

# train tokenizer config
num_chunks: 32
num_processes: 8

# encode config
batch_size: 10000  # NOTE(review): 4096 appears to be an alternative value
n_workers: 8