import datasets
from transformers import AutoTokenizer
|
|
# Load the preprocessed LM corpus and carve out a small sample for
# tokenizer training. NOTE(review): train_size=0.02 keeps only 2% of
# the rows as "train" — presumably to keep tokenizer training fast;
# confirm this sampling rate is intentional.
full_dataset = datasets.load_from_disk("/researchdisk/lm_training_dataset_v2_filtered")
dataset = full_dataset["train"].train_test_split(train_size=0.02)

# Base tokenizer whose algorithm/normalization is reused when
# retraining on the new corpus below.
old_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b_700bt_preview")
|
|
def get_training_corpus(batch_size=1000, data=None):
    """Yield batches of raw text for tokenizer training.

    Generalized from the original hard-coded chunking: ``batch_size``
    defaults to the original 1000, and ``data`` defaults to the
    module-level ``dataset["train"]`` split, so existing calls with no
    arguments behave exactly as before.

    Args:
        batch_size: Number of examples per yielded batch.
        data: Sliceable dataset-like object whose slices expose a
            ``"text"`` column (as Hugging Face ``Dataset`` slices do).
            Defaults to ``dataset["train"]``.

    Returns:
        A generator producing one list of texts per batch; the final
        batch may be shorter than ``batch_size``.
    """
    if data is None:
        data = dataset["train"]
    return (
        data[start : start + batch_size]["text"]
        for start in range(0, len(data), batch_size)
    )
|
|
|
|
# Stream the sampled corpus into the trainer and retrain the base
# tokenizer's algorithm from scratch on it. vocab_size=64256 is the
# target vocabulary; min_frequency=2 drops tokens seen fewer than
# twice in the corpus.
training_corpus = get_training_corpus()

tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=64256,
    min_frequency=2,
)
tokenizer.save_pretrained("./")