Train vocab in Python

2023-08-23 17:28:14 +03:00
parent 7ac65cb2c2
commit fe9b9f2f15
2 changed files with 17 additions and 11 deletions
@@ -163,7 +163,7 @@ python tinystories.py train_vocab --vocab_size=4096
 python tinystories.py pretokenize --vocab_size=4096
 ```

-The `train_vocab` stage will call the `train_vocab.sh` script, which calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
+The `train_vocab` stage calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce, as well as I could, the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm, which starts out with raw UTF-8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.

 A quick note of interest: a vocab size of 4096, trained specifically on tinystories, creates integer sequences with about the same length per example as the default Llama 2 tokenizer with its 32000 tokens! This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster.

@@ -13,6 +13,7 @@ from functools import partial

 import numpy as np
 import requests
+import sentencepiece as spm
 import torch
 import torch.distributed as dist
 from tqdm import tqdm
@@ -97,16 +98,21 @@ def train_vocab(vocab_size):
                of.write(text + "\n")
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

-    # 2) run the train_vocab.sh script that trains the sentencepiece model
-    print("Will now train the vocab with:")
-    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
-    print(cmd)
-    print("OK? [y/N] ")
-    dec = input()
-    if dec.lower() != "y":
-        print("Exiting...")
-        return
-    os.system(cmd)
+    # 2) train the sentencepiece model
+    print("Will now train the vocab...")
+
+    spm.SentencePieceTrainer.train(input=tiny_file,
+                                   model_prefix=prefix,
+                                   model_type="bpe",
+                                   vocab_size=vocab_size,
+                                   self_test_sample_size=0,
+                                   input_format="text",
+                                   character_coverage=1.0,
+                                   split_digits=True,
+                                   allow_whitespace_only_pieces=True,
+                                   byte_fallback=True,
+                                   unk_surface=r" \342\201\207 ",
+                                   normalization_rule_name="identity")

    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")