# pretrain_cs336_lm.yaml
#
# Pre-training configuration for the `cs336_lm` model.
# Values written as ${...} are interpolations resolved when the config is
# loaded; ${hydra:runtime.cwd} is supplied by Hydra.

# Uncomment to send run outputs to a custom directory:
# hydra:
#   run:
#     dir: ./my_output_dir  # save outputs to custom dir

root_dir: ${hydra:runtime.cwd}
exp_name: cs336_lm_owt
dataset_name: owt  # `TinyStories` or `owt`, change as needed
dat_dir: ${root_dir}/data/${dataset_name}/dat

training:
  # Data files, located under dat_dir.
  train_data_path: ${dat_dir}/train.dat
  val_data_path: ${dat_dir}/valid.dat
  # Save path used both for a first training run and for a resumed run.
  save_path: "checkpoints/${exp_name}"

  # Optimisation settings.
  lr: 0.0005
  min_lr: 0.0001
  weight_decay: 0.01
  batch_size: 32
  context_length: 256
  train_steps: 5000
  clip_grad_norm: 1.0
  warmup_iters: 500
  cosine_iters: 5000

  # Validation and checkpoint cadence.
  val_interval: 100
  val_batches: 20
  save_interval: 1000
  resume_checkpoint: null  # load checkpoint from current hydra run dir

# NOTE(review): the original nesting of `model_type` was lost; it is placed at
# top level here. Move it under `training:` if the consumer reads it from there.
model_type: cs336_lm

model:
  vocab_size: 32000
  # NOTE(review): same value as training.context_length; presumably the two
  # must stay in sync - confirm against the training script.
  context_length: 256
  d_model: 512
  num_layers: 4
  num_heads: 16
  d_ff: 1344
  rope_theta: 10000.0