# pretrain_qwen2_5.yaml
#
# Hydra configuration for pretraining a small Qwen2-architecture causal LM.
# Values of the form ${...} are OmegaConf interpolations; unprefixed paths
# such as ${root_dir} are absolute, i.e. looked up from the config root.

# Uncomment to override Hydra's run directory:
# hydra:
#   run:
#     dir: ./my_output_dir  # save outputs to custom dir

# Working directory the job was launched from (Hydra `runtime.cwd`).
root_dir: ${hydra:runtime.cwd}

training:
  train_data_path: ${root_dir}/data/dat/train.dat
  val_data_path: ${root_dir}/data/dat/valid.dat
  save_path: "checkpoints"  # save path for both first train and resume train
  lr: 0.0005
  min_lr: 0.0001
  weight_decay: 0.01
  batch_size: 64
  context_length: 256
  train_steps: 5000
  clip_grad_norm: 1.0
  warmup_iters: 500
  cosine_iters: 5000
  val_interval: 200
  val_batches: 20
  save_interval: 1000
  resume_checkpoint: null  # load checkpoint from current hydra run dir

# NOTE(review): placed at the top level as a model selector; the original
# nesting was lost, so confirm the training code does not read
# `training.model_type` instead.
model_type: qwen2_5

# Model hyper-parameters. A trailing `# N` comment records a larger
# alternative value for the same field (presumably the stock Qwen2.5-0.5B
# setting; confirm before relying on it).
model:
  architectures:
    - Qwen2ForCausalLM
  attention_dropout: 0.0
  bos_token_id: 151643
  eos_token_id: 151645
  hidden_act: silu
  hidden_size: 128  # 896
  initializer_range: 0.02
  intermediate_size: 384  # 4864
  max_position_embeddings: 512  # 32768
  max_window_layers: 21
  model_type: qwen2
  num_attention_heads: 8  # 14
  num_hidden_layers: 8  # 24
  num_key_value_heads: 2
  # Written with a mantissa dot so YAML 1.1 loaders (e.g. PyYAML) read a
  # float; a bare `1e-06` is loaded as the string "1e-06" by those loaders.
  rms_norm_eps: 1.0e-06
  rope_theta: 1000000.0
  sliding_window: 512  # 32768
  tie_word_embeddings: true
  torch_dtype: bfloat16
  transformers_version: "4.43.1"
  use_cache: true
  use_sliding_window: false
  vocab_size: 151936

# Only used for pretraining Qwen.
dataset_split: train  # `train` or `valid`, change as needed

tokenizer:
  # ${dataset_fname} is not defined in this file; it must be supplied by
  # another config or a command-line override, otherwise resolving this
  # value fails.
  # NOTE(review): the trailing `/` here makes the two paths below contain
  # `//`; most filesystems tolerate that, but confirm before changing it.
  tokenizer_dir: ${root_dir}/tokenizers/${dataset_fname}_${dataset_split}/  # save path for tokenizer
  # Fully qualified because unprefixed interpolations are resolved from the
  # config root, where no `tokenizer_dir` key exists.
  vocab_path: ${tokenizer.tokenizer_dir}/vocab.pkl
  merges_path: ${tokenizer.tokenizer_dir}/merges.pkl
  special_tokens: ["<|endoftext|>"]