# tokenizer.yaml - configuration for tokenizer training and dataset encoding
  1. # hydra:
  2. # run:
  3. # dir: ./my_output_dir # save outputs to custom dir
  4. root_dir: ${hydra:runtime.cwd} # 运行代码的根目录
  5. dataset_name: owt # `TinyStories` or `owt`, change as needed
  6. dataset_split: train # `train` or `valid`, change as needed
  7. data_dir: ${root_dir}/data/${dataset_name}
  8. txt_dir: ${data_dir}/txt
  9. dat_dir: ${data_dir}/dat
  10. train_txt_path: ${txt_dir}/train.txt
  11. valid_txt_path: ${txt_dir}/valid.txt
  12. input_path: ${txt_dir}/${dataset_split}.txt # txt data for training tokenizer
  13. train_dat_path: ${dat_dir}/train.dat
  14. valid_dat_path: ${dat_dir}/valid.dat
  15. tokenizer_dir: ${root_dir}/tokenizers/${dataset_name}_${dataset_split}/ # save path for tokenizer
  16. vocab_path: ${tokenizer_dir}/vocab.pkl
  17. merges_path: ${tokenizer_dir}/merges.pkl
  18. vocab_size_map:
  19. TinyStories: 10_000
  20. owt: 32_000
  21. # 根据 dataset_name 动态设置 vocab_size
  22. vocab_size: ${vocab_size_map.${dataset_name}}
  23. special_tokens: ["<|endoftext|>"]
  24. # train tokenizer config
  25. num_chunks: 32
  26. num_processes: 8
  27. # encode config
  28. batch_size: 10000 # 4096
  29. n_workers: 8 # 8