From 4c6f0af9ff3671b0b8053c6a3a512a06bad5c676 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 11 Aug 2023 03:58:22 +0000 Subject: [PATCH] add the ability to train a custom sentencepiece tokenizer with a given vocab_size, and pretok with it. some more changes still needed to merge this branch, in train.py and ofc run.c. did this in a sadly bit ugly, but fully backwards compatible way. basically when we use custom tokenizer we create a whole new directory structure for that --- tinystories.py | 115 ++++++++++++++++++++++++++++++++++++++------ tokenizer.py | 13 ++--- train_vocab.sh | 126 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 233 insertions(+), 21 deletions(-) create mode 100755 train_vocab.sh diff --git a/tinystories.py b/tinystories.py index 419e0d5..d41f8fc 100644 --- a/tinystories.py +++ b/tinystories.py @@ -9,6 +9,7 @@ import os import random from typing import List from concurrent.futures import ProcessPoolExecutor +from functools import partial import numpy as np import requests @@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024): def download(): - """Downloads the dataset to disk.""" + """Downloads the TinyStories dataset to DATA_CACHE_DIR""" os.makedirs(DATA_CACHE_DIR, exist_ok=True) # download the TinyStories dataset, unless it's already downloaded @@ -66,10 +67,63 @@ def download(): print(f"Number of shards: {len(shard_filenames)}") print(f"Example story:\n{data[0]}") +def train_vocab(vocab_size): + """ + Trains a custom sentencepiece tokenizer on the TinyStories dataset. + The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories, + where N is the vocab size. This is also where the pretok .bin files will go. 
+ """ + assert vocab_size > 0, "Vocab size must be positive" -def process_shard(args): + # output file prefix path for sentencepiece + prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") + + # how many shards we'll use for vocab training, kept low for efficiency + num_shards = 10 + + # 1) export a large chunk of text as a single text file tiny.txt + tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt") + data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") + shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + + print(f"Writing temporary file {tiny_file} with {num_shards} shards...") + with open(tiny_file, "w") as of: + for shard in tqdm(shard_filenames[:num_shards]): + with open(shard, "r") as f: + data = json.load(f) + for example in data: + text = example["story"] + text = text.strip() + of.write(text + "\n") + print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB") + + # 2) run the train_vocab.sh script that trains the sentencepiece model + print("Will now train the vocab with:") + cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}" + print(cmd) + print("OK? [y/N] ") + dec = input() + if dec.lower() != "y": + print("Exiting...") + return + os.system(cmd) + + # 3) optional cleanup, ask the user if they'd like to delete tiny.txt + dec = input(f"Delete the temporary file {tiny_file}? 
[y/N] ") + if dec.lower() == "y": + os.remove(tiny_file) + print(f"Deleted {tiny_file}") + + print(f"Trained tokenizer is in {prefix}.model") + print("Done.") + + +def process_shard(args, vocab_size): shard_id, shard = args - enc = Tokenizer() + tokenizer_model = None + if vocab_size > 0: + tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model") + enc = Tokenizer(tokenizer_model) with open(shard, "r") as f: data = json.load(f) all_tokens = [] @@ -80,21 +134,37 @@ def process_shard(args): all_tokens.extend(tokens) # convert to uint16 nparray all_tokens = np.array(all_tokens, dtype=np.uint16) - # write to disk - tokenized_filename = shard.replace(".json", ".bin") + # calculate the output filename + if vocab_size == 0: + # if we're using Llama 2, just save the tokenized file in the same dir + tokenized_filename = shard.replace(".json", ".bin") + else: + # save .bin files into a new tok{N} directory + bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") + shard_basename = os.path.basename(shard) + bin_basename = shard_basename.replace(".json", ".bin") + tokenized_filename = os.path.join(bin_dir, bin_basename) + # write the bytes with open(tokenized_filename, "wb") as f: f.write(all_tokens.tobytes()) - print(f"Saved {tokenized_filename}") + # calculate the average sequence length (they are separated by BOS=1) + avg_seq_len = all_tokens.size / ((all_tokens == 1).sum()) + print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}") -def pretokenize(): +def pretokenize(vocab_size): # iterate the shards and tokenize all of them one by one data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) + if vocab_size > 0: + # .bin files will be saved into tok{N} directory, create it once here + bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") + os.makedirs(bin_dir, exist_ok=True) # process all the shards in a process pool + fun = partial(process_shard, 
vocab_size=vocab_size) with ProcessPoolExecutor() as executor: - executor.map(process_shard, enumerate(shard_filenames)) + executor.map(fun, enumerate(shard_filenames)) print("Done.") @@ -155,14 +225,29 @@ class Task: if __name__ == "__main__": + """ + These stages are designed to be run in order. + + To tokenize data with the Llama 2 tokenizer: + python tinystories.py download + python tinystories.py pretokenize + + To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.: + python tinystories.py download + python tinystories.py train_vocab --vocab_size=2048 + python tinystories.py pretokenize --vocab_size=2048 + """ parser = argparse.ArgumentParser() - parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"]) + parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"]) + parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.") args = parser.parse_args() # depending on the stage call the appropriate function - fun = { - "download": download, - "pretokenize": pretokenize, - } - fun[args.stage]() - + if args.stage == "download": + download() + elif args.stage == "train_vocab": + train_vocab(vocab_size=args.vocab_size) + elif args.stage == "pretokenize": + pretokenize(vocab_size=args.vocab_size) + else: + raise ValueError(f"Unknown stage {args.stage}") diff --git a/tokenizer.py b/tokenizer.py index 35eee20..981b2ac 100644 --- a/tokenizer.py +++ b/tokenizer.py @@ -10,14 +10,13 @@ from typing import List from sentencepiece import SentencePieceProcessor TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model -TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C class Tokenizer: - def __init__(self): - model_path = TOKENIZER_MODEL + def __init__(self, tokenizer_model=None): + model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL assert 
os.path.isfile(model_path), model_path self.sp_model = SentencePieceProcessor(model_file=model_path) - #print(f"Loaded SentencePiece model from {model_path}") + self.model_path = model_path # BOS / EOS token IDs self.n_words: int = self.sp_model.vocab_size() @@ -59,12 +58,14 @@ class Tokenizer: tokens.append(b) scores.append(s) - + # record the max token length max_token_length = max(len(t) for t in tokens) # write to a binary file - with open(TOKENIZER_BIN, 'wb') as f: + # the tokenizer.bin file is the same as .model file, but .bin + tokenizer_bin = self.model_path.replace('.model', '.bin') + with open(tokenizer_bin, 'wb') as f: f.write(struct.pack("I", max_token_length)) for bytes, score in zip(tokens, scores): f.write(struct.pack("fI", score, len(bytes))) diff --git a/train_vocab.sh b/train_vocab.sh new file mode 100755 index 0000000..7803af8 --- /dev/null +++ b/train_vocab.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# Trains a sentencepiece tokenizer model on a bunch of given data, my best +# effort attempt to replicate how Meta trained their Llama 2 tokenizer. + +# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size> +# example: +# ./train_vocab.sh tiny.txt tokenizer_tiny 1024 +# requirements: +# install https://github.com/google/sentencepiece + +# check if the correct number of arguments is provided +if [ $# -ne 3 ]; then + echo "Usage: $0 " + exit 1 +fi + +# assign command-line arguments to variables +input=$1 +model_prefix=$2 +vocab_size=$3 + +# check if input file exists +if [ ! -f "$input" ]; then + echo "Usage: $0 " + echo "input '$input' not found." + exit 1 +fi + +# check if vocab_size is a positive integer +if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then + echo "Usage: $0 " + echo "vocab_size size must be a positive integer." 
+ exit 1 +fi + +# Print the processed inputs +echo "Input: $input" +echo "Model Prefix: $model_prefix" +echo "Vocabulary Size: $vocab_size" + +# train a sentencepiece tokenizer model +# Llama 2 config can be printed as follows: + +# import sentencepiece.sentencepiece_model_pb2 +# mp = sentencepiece.sentencepiece_model_pb2.ModelProto() +# mp.ParseFromString(open("tokenizer.model", "rb").read()) +# print(mp.trainer_spec) +# print(mp.normalizer_spec) + +# this gives: + +# trainer_spec { +# input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged" +# model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2" +# model_type: BPE +# vocab_size: 32000 +# self_test_sample_size: 0 +# input_format: "text" +# character_coverage: 0.9999499917030334 +# input_sentence_size: 200000000 +# seed_sentencepiece_size: 1000000 +# shrinking_factor: 0.75 +# num_threads: 80 +# num_sub_iterations: 2 +# max_sentence_length: 4192 +# shuffle_input_sentence: true +# max_sentencepiece_length: 16 +# split_by_unicode_script: true +# split_by_whitespace: true +# split_by_number: true +# treat_whitespace_as_suffix: false +# split_digits: true +# allow_whitespace_only_pieces: true +# vocabulary_output_piece_score: true +# hard_vocab_limit: true +# use_all_vocab: false +# byte_fallback: true +# required_chars: "" +# unk_id: 0 +# bos_id: 1 +# eos_id: 2 +# pad_id: -1 +# unk_surface: " \342\201\207 " +# unk_piece: "<unk>" +# bos_piece: "<s>" +# eos_piece: "</s>" +# pad_piece: "<pad>" +# train_extremely_large_corpus: false +# enable_differential_privacy: false +# differential_privacy_noise_level: 0.0 +# differential_privacy_clipping_threshold: 0 +# } +# normalizer_spec { +# name: "identity" +# precompiled_charsmap: "" +# add_dummy_prefix: true +# remove_extra_whitespaces: false +# normalization_rule_tsv: "" +# } + +# let's now use spm_train to train this exact model +# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md + +# we'll depart on a few settings: +# character_coverage -> 
1.0 + +# other important notes: +# --split_digits = true, per the paper +# --allow_whitespace_only_pieces is true, default in spm is false +# --byte_fallback is true, default in spm is false +# --normalization_rule_name is identity, default in spm is nmt_nfkc + +spm_train --input="$input" \ + --model_prefix="$model_prefix" \ + --model_type=bpe \ + --vocab_size="$vocab_size" \ + --self_test_sample_size=0 \ + --input_format="text" \ + --character_coverage=1.0 \ + --num_threads="$(nproc)" \ + --split_digits=true \ + --allow_whitespace_only_pieces=true \ + --byte_fallback=true \ + --unk_surface=" \342\201\207 " \ + --normalization_rule_name=identity \