# Hydra/OmegaConf configuration for training a scaled-down Qwen2-style causal LM.
# `${...}` expressions are interpolations that are resolved when a value is read.

# To write run outputs to a custom directory, uncomment:
# hydra:
#   run:
#     dir: ./my_output_dir

# Anchor for the data and tokenizer paths below (Hydra's `runtime.cwd`).
root_dir: ${hydra:runtime.cwd}

training:
  train_data_path: ${root_dir}/data/dat/train.dat
  val_data_path: ${root_dir}/data/dat/valid.dat
  save_path: "checkpoints"  # save path for both first train and resume train
  lr: 0.0005
  min_lr: 0.0001
  weight_decay: 0.01
  batch_size: 64
  context_length: 256
  train_steps: 5000
  clip_grad_norm: 1.0
  warmup_iters: 500
  cosine_iters: 5000
  val_interval: 200
  val_batches: 20
  save_interval: 1000
  resume_checkpoint: null  # load checkpoint from current hydra run dir

# NOTE(review): placed at top level because `model` has its own `model_type`
# key; confirm the consumer does not read `training.model_type` instead.
model_type: qwen2_5

# Trailing comments on individual keys record the larger value that the
# reduced setting replaces (presumably from the stock Qwen2.5 config - confirm).
model:
  architectures:
    - Qwen2ForCausalLM
  attention_dropout: 0.0
  bos_token_id: 151643
  eos_token_id: 151645
  hidden_act: silu
  hidden_size: 128  # 896
  initializer_range: 0.02
  intermediate_size: 384  # 4864
  max_position_embeddings: 512  # 32768
  max_window_layers: 21
  model_type: qwen2
  num_attention_heads: 8  # 14
  num_hidden_layers: 8  # 24
  num_key_value_heads: 2
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. plain
  # PyYAML) read a bare `1e-06` as a string rather than a float.
  rms_norm_eps: 1.0e-06
  rope_theta: 1000000.0
  sliding_window: 512  # 32768
  tie_word_embeddings: true
  torch_dtype: bfloat16
  # Quoted so the version is unambiguously a string.
  transformers_version: "4.43.1"
  use_cache: true
  use_sliding_window: false
  vocab_size: 151936

## only used for pretrain Qwen
dataset_split: train  # `train` or `valid`, change as needed

tokenizer:
  # NOTE(review): `dataset_fname` is not defined in the part of the config
  # visible here; it must be supplied elsewhere (another config file or a
  # command-line override) before this value can be resolved.
  tokenizer_dir: ${root_dir}/tokenizers/${dataset_fname}_${dataset_split}/  # save path for tokenizer
  # Absolute interpolations must name the full path of the key, so the sibling
  # `tokenizer_dir` is referenced as `tokenizer.tokenizer_dir`.
  # `tokenizer_dir` ends with `/`, so the resolved paths contain `//`.
  vocab_path: ${tokenizer.tokenizer_dir}/vocab.pkl
  merges_path: ${tokenizer.tokenizer_dir}/merges.pkl
  special_tokens: ["<|endoftext|>"]