| 1234567891011121314151617181920212223242526272829303132 |
- import os
- import pickle
- import hydra
- from omegaconf import DictConfig
- # from clean_llm.tokenizer.train import run_train_bpe # slow version
- from clean_llm.tokenizer.train_fast import run_train_bpe # fast version
- @hydra.main(config_path="configs", config_name="tokenizer", version_base=None)
- def main(cfg: DictConfig):
- vocab, merges = run_train_bpe(
- input_path=cfg.input_path,
- vocab_size=cfg.vocab_size,
- special_tokens=cfg.special_tokens,
- num_chunks=cfg.num_chunks,
- num_processes=cfg.num_processes
- )
- os.makedirs(cfg.tokenizer_dir, exist_ok=True)
- with open(cfg.vocab_path, "wb") as f:
- pickle.dump(vocab, f)
- with open(cfg.merges_path, "wb") as f:
- pickle.dump(merges, f)
- # 统计最长token
- longest_token = max(vocab.values(), key=len)
- print("最长token:", longest_token, "长度:", len(longest_token))
- if __name__ == "__main__":
- main()
|