tokenize.py 693 B

12345678910111213141516
  1. import hydra
  2. from omegaconf import DictConfig
  3. from clean_llm.tokenizer.tokenizer import get_custom_tokenizer, encode_txt_as_array
  @hydra.main(config_path="configs", config_name="tokenizer", version_base=None)
  def main(cfg: DictConfig):
      """Build the custom tokenizer and run the encoder over both text splits.

      Hydra supplies ``cfg`` using ``config_path="configs"`` and
      ``config_name="tokenizer"``.

      Args:
          cfg: Configuration object. Fields read here are ``vocab_path``,
              ``merges_path``, ``special_tokens``, ``train_txt_path``,
              ``train_dat_path``, ``valid_txt_path``, ``valid_dat_path``,
              ``batch_size`` and ``n_workers``.
      """
      vocab = cfg.vocab_path
      merges = cfg.merges_path
      specials = cfg.special_tokens
      tokenizer = get_custom_tokenizer(
          vocab_path=vocab,
          merges_path=merges,
          special_tokens=specials,
      )

      # Train split first, then validation; the same tokenizer, batch size
      # and worker count are used for both.
      # NOTE(review): presumably each call encodes the *_txt_path file and
      # writes the result to *_dat_path -- confirm in encode_txt_as_array.
      for split in ("train", "valid"):
          encode_txt_as_array(
              tokenizer,
              getattr(cfg, f"{split}_txt_path"),
              getattr(cfg, f"{split}_dat_path"),
              cfg.batch_size,
              cfg.n_workers,
          )
  # Script entry point: invoke the Hydra-wrapped main() only when this file
  # is executed directly, not when it is imported as a module.
  if __name__ == "__main__":
      main()