Add the ability to train a custom sentencepiece tokenizer with a given vocab_size, and to pretokenize with it. Some more changes are still needed in train.py and, of course, run.c before this branch can be merged. This is done in a sadly somewhat ugly, but fully backwards-compatible way: when we use a custom tokenizer we create a whole new directory structure for it.

2023-08-11 03:58:22 +00:00
parent c42641205f
commit 4c6f0af9ff
3 changed files with 233 additions and 21 deletions
@@ -9,6 +9,7 @@ import os
 import random
 from typing import List
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 import numpy as np
 import requests
@@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
 def download():
-    """Downloads the dataset to disk."""
+    """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
    # download the TinyStories dataset, unless it's already downloaded
@@ -66,10 +67,63 @@ def download():
    print(f"Number of shards: {len(shard_filenames)}")
    print(f"Example story:\n{data[0]}")
 def train_vocab(vocab_size):
    """
    Trains a custom sentencepiece tokenizer on the TinyStories dataset.

    The training text is exported from the first few data shards into
    DATA_CACHE_DIR/tiny.txt, and train_vocab.sh is then run on it with the
    output prefix DATA_CACHE_DIR/tok{N}, where N is the vocab size. The
    trained model is expected at DATA_CACHE_DIR/tok{N}.model. The
    DATA_CACHE_DIR/tok{N} directory is also where the pretok .bin files go.

    The user is asked to confirm before training starts, and afterwards
    whether the temporary tiny.txt should be deleted.

    Raises:
        AssertionError: if vocab_size is not positive.
        RuntimeError: if the training script exits with a non-zero status.
    """
    # imported locally: only this stage launches an external process
    import shlex
    import subprocess

    assert vocab_size > 0, "Vocab size must be positive"

    # output file prefix path for sentencepiece
    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")

    # how many shards we'll use for vocab training, kept low for efficiency
    num_shards = 10

    # 1) export a large chunk of text as a single text file tiny.txt
    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))

    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
    # explicit utf-8 so the export does not depend on the platform locale
    with open(tiny_file, "w", encoding="utf-8") as of:
        for shard in tqdm(shard_filenames[:num_shards]):
            with open(shard, "r", encoding="utf-8") as f:
                data = json.load(f)
            for example in data:
                # one story per line
                of.write(example["story"].strip() + "\n")
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

    # 2) run the train_vocab.sh script that trains the sentencepiece model
    # passed as an argument list (no shell), so paths containing spaces or
    # shell metacharacters reach the script intact
    cmd = ["bash", "train_vocab.sh", tiny_file, prefix, str(vocab_size)]
    print("Will now train the vocab with:")
    print(" ".join(shlex.quote(arg) for arg in cmd))
    print("OK? [y/N] ")
    dec = input()
    if dec.lower() != "y":
        print("Exiting...")
        return
    result = subprocess.run(cmd)
    if result.returncode != 0:
        # do not claim success (or offer to delete the input) on failure;
        # tiny.txt is left in place so the run can be inspected or retried
        raise RuntimeError(
            f"train_vocab.sh failed with exit code {result.returncode}"
        )

    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
    if dec.lower() == "y":
        os.remove(tiny_file)
        print(f"Deleted {tiny_file}")

    print(f"Trained tokenizer is in {prefix}.model")
    print("Done.")
 def process_shard(args, vocab_size):
    shard_id, shard = args
-    enc = Tokenizer()
+    tokenizer_model = None
    if vocab_size > 0:
        tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
    enc = Tokenizer(tokenizer_model)
    with open(shard, "r") as f:
        data = json.load(f)
    all_tokens = []
@@ -80,21 +134,37 @@ def process_shard(args):
        all_tokens.extend(tokens)
    # convert to uint16 nparray
    all_tokens = np.array(all_tokens, dtype=np.uint16)
-    # write to disk
+    # calculate the output filename
-    tokenized_filename = shard.replace(".json", ".bin")
+    if vocab_size == 0:
        # if we're using Llama 2, just save the tokenized file in the same dir
        tokenized_filename = shard.replace(".json", ".bin")
    else:
        # save .bin files into a new tok{N} directory
        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
        shard_basename = os.path.basename(shard)
        bin_basename = shard_basename.replace(".json", ".bin")
        tokenized_filename = os.path.join(bin_dir, bin_basename)
    # write the bytes
    with open(tokenized_filename, "wb") as f:
        f.write(all_tokens.tobytes())
-    print(f"Saved {tokenized_filename}")
+    # calculate the average sequence length (they are separated by BOS=1)
    avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
-def pretokenize():
def pretokenize(vocab_size):
    """
    Tokenizes every TinyStories shard in parallel via process_shard.

    vocab_size selects the tokenizer: 0 means the default Llama 2 tokenizer,
    a positive value N means the custom tokenizer trained with that vocab
    size, in which case the DATA_CACHE_DIR/tok{N} output directory is
    created here before any worker starts.

    Any exception raised inside a worker process is re-raised here instead
    of being silently dropped.
    """
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if vocab_size > 0:
        # .bin files will be saved into tok{N} directory, create it once here
        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
        os.makedirs(bin_dir, exist_ok=True)

    # process all the shards in a process pool
    fun = partial(process_shard, vocab_size=vocab_size)
    with ProcessPoolExecutor() as executor:
        # executor.map only surfaces a worker's exception when its result is
        # retrieved, so drain the iterator; otherwise failed shards would go
        # unnoticed and "Done." would be printed regardless
        for _ in executor.map(fun, enumerate(shard_filenames)):
            pass
    print("Done.")
@@ -155,14 +225,29 @@ class Task:
 if __name__ == "__main__":
    """
    These stages are designed to be run in order.
    To tokenize data with the Llama 2 tokenizer:
    python tinystories.py download
    python tinystories.py pretokenize
    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
    python tinystories.py download
    python tinystories.py train_vocab --vocab_size=2048
    python tinystories.py pretokenize --vocab_size=2048
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
    args = parser.parse_args()

    # reject invalid vocab sizes up front with a normal CLI error:
    # a negative value would select the Llama 2 tokenizer yet target a
    # tok{N} output directory that is never created
    if args.vocab_size < 0:
        parser.error("--vocab_size must be >= 0")
    # training a vocab needs an explicit positive size; 0 only means
    # "use the Llama 2 tokenizer" for pretokenization
    if args.stage == "train_vocab" and args.vocab_size == 0:
        parser.error("train_vocab requires --vocab_size > 0")

    # depending on the stage call the appropriate function
    if args.stage == "download":
        download()
    elif args.stage == "train_vocab":
        train_vocab(vocab_size=args.vocab_size)
    elif args.stage == "pretokenize":
        pretokenize(vocab_size=args.vocab_size)
    else:
        raise ValueError(f"Unknown stage {args.stage}")
@@ -10,14 +10,13 @@ from typing import List
 from sentencepiece import SentencePieceProcessor
 TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
 TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
 class Tokenizer:
-    def __init__(self):
+    def __init__(self, tokenizer_model=None):
-        model_path = TOKENIZER_MODEL
+        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        #print(f"Loaded SentencePiece model from {model_path}")
+        self.model_path = model_path
        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
@@ -64,7 +63,9 @@ class Tokenizer:
        max_token_length = max(len(t) for t in tokens)
        # write to a binary file
-        with open(TOKENIZER_BIN, 'wb') as f:
+        # the tokenizer.bin file is the same as .model file, but .bin
        tokenizer_bin = self.model_path.replace('.model', '.bin')
        with open(tokenizer_bin, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            for bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(bytes)))
@@ -0,0 +1,126 @@
 #!/bin/bash
 # Trains a sentencepiece tokenizer model on a bunch of given data, my best
 # effort attempt to replicate how Meta trained their Llama 2 tokenizer.
 # usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
 # example:
 # ./train_vocab.sh tiny.txt tokenizer_tiny 1024
 # requirements:
 # install https://github.com/google/sentencepiece
 # check if the correct number of arguments are provided
 if [ $# -ne 3 ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
    exit 1
 fi
 # assign command-line arguments to variables
 input=$1
 model_prefix=$2
 vocab_size=$3
 # check if input file exists
 if [ ! -f "$input" ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
    echo "input '$input' not found."
    exit 1
 fi
 # check if vocab_size is a positive integer
 if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
    echo "vocab_size size must be a positive integer."
    exit 1
 fi
 # Print the processed inputs
 echo "Input: $input"
 echo "Model Prefix: $model_prefix"
 echo "Vocabulary Size: $vocab_size"
 # train a sentencepiece tokenizer model
 # Llama 2 config can be printed as follows:
 # import sentencepiece.sentencepiece_model_pb2
 # mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
 # mp.ParseFromString(open("tokenizer.model", "rb").read())
 # print(mp.trainer_spec)
 # print(mp.normalizer_spec)
 # this gives:
 # trainer_spec {
 #   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
 #   model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
 #   model_type: BPE
 #   vocab_size: 32000
 #   self_test_sample_size: 0
 #   input_format: "text"
 #   character_coverage: 0.9999499917030334
 #   input_sentence_size: 200000000
 #   seed_sentencepiece_size: 1000000
 #   shrinking_factor: 0.75
 #   num_threads: 80
 #   num_sub_iterations: 2
 #   max_sentence_length: 4192
 #   shuffle_input_sentence: true
 #   max_sentencepiece_length: 16
 #   split_by_unicode_script: true
 #   split_by_whitespace: true
 #   split_by_number: true
 #   treat_whitespace_as_suffix: false
 #   split_digits: true
 #   allow_whitespace_only_pieces: true
 #   vocabulary_output_piece_score: true
 #   hard_vocab_limit: true
 #   use_all_vocab: false
 #   byte_fallback: true
 #   required_chars: ""
 #   unk_id: 0
 #   bos_id: 1
 #   eos_id: 2
 #   pad_id: -1
 #   unk_surface: " \342\201\207 "
 #   unk_piece: "<unk>"
 #   bos_piece: "<s>"
 #   eos_piece: "</s>"
 #   pad_piece: "<pad>"
 #   train_extremely_large_corpus: false
 #   enable_differential_privacy: false
 #   differential_privacy_noise_level: 0.0
 #   differential_privacy_clipping_threshold: 0
 # }
 # normalizer_spec {
 #   name: "identity"
 #   precompiled_charsmap: ""
 #   add_dummy_prefix: true
 #   remove_extra_whitespaces: false
 #   normalization_rule_tsv: ""
 # }
 # let's now use spm_train to train this exact model
 # options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
 # we'll depart on a few settings:
 # character_coverage -> 1.0
 # other important notes:
 # --split_digits = true, per the paper
 # --allow_whitespace_only_pieces is true, default in spm is false
 # --byte_fallback is true, default in spm is false
 # --normalization_rule_name is identity, default in spm is nmt_nfkc
 spm_train --input="$input" \
          --model_prefix="$model_prefix" \
          --model_type=bpe \
          --vocab_size="$vocab_size" \
          --self_test_sample_size=0 \
          --input_format="text" \
          --character_coverage=1.0 \
          --num_threads="$(nproc)" \
          --split_digits=true \
          --allow_whitespace_only_pieces=true \
          --byte_fallback=true \
          --unk_surface=" \342\201\207 " \
          --normalization_rule_name=identity \