Train vocab in Python
This commit is contained in:
@@ -163,7 +163,7 @@ python tinystories.py train_vocab --vocab_size=4096
|
||||
python tinystories.py pretokenize --vocab_size=4096
|
||||
```
|
||||
|
||||
The `train_vocab` stage will call the `train_vocab.sh` script, which calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
|
||||
The `train_vocab` stage will call the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce, as well as I could, the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm, which starts out with raw UTF-8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
|
||||
|
||||
A quick note of interest is that a vocab size of 4096 trained specifically on tinystories creates integer sequences with about the same sequence length per example as the default Llama 2 tokenizer of 32000 tokens! This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster.
|
||||
|
||||
|
||||
+16
-10
@@ -13,6 +13,7 @@ from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
import sentencepiece as spm
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from tqdm import tqdm
|
||||
@@ -97,16 +98,21 @@ def train_vocab(vocab_size):
|
||||
of.write(text + "\n")
|
||||
print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
|
||||
|
||||
# 2) run the train_vocab.sh script that trains the sentencepiece model
|
||||
print("Will now train the vocab with:")
|
||||
cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
|
||||
print(cmd)
|
||||
print("OK? [y/N] ")
|
||||
dec = input()
|
||||
if dec.lower() != "y":
|
||||
print("Exiting...")
|
||||
return
|
||||
os.system(cmd)
|
||||
# 2) train the sentencepiece model
|
||||
print("Will now train the vocab...")
|
||||
|
||||
spm.SentencePieceTrainer.train(input=tiny_file,
|
||||
model_prefix=prefix,
|
||||
model_type="bpe",
|
||||
vocab_size=vocab_size,
|
||||
self_test_sample_size=0,
|
||||
input_format="text",
|
||||
character_coverage=1.0,
|
||||
split_digits=True,
|
||||
allow_whitespace_only_pieces=True,
|
||||
byte_fallback=True,
|
||||
unk_surface=r" \342\201\207 ",
|
||||
normalization_rule_name="identity")
|
||||
|
||||
# 3) optional cleanup, ask the user if they'd like to delete tiny.txt
|
||||
dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
|
||||
|
||||
Reference in New Issue
Block a user