Add the ability to train a custom sentencepiece tokenizer with a given vocab_size, and to pretokenize with it. Some more changes are still needed before this branch can be merged, in train.py and of course run.c. This was done in a sadly somewhat ugly, but fully backwards-compatible way: when we use a custom tokenizer, we create a whole new directory structure for it.

This commit is contained in:
Andrej Karpathy
2023-08-11 03:58:22 +00:00
parent c42641205f
commit 4c6f0af9ff
3 changed files with 233 additions and 21 deletions
+100 -15
View File
@@ -9,6 +9,7 @@ import os
import random import random
from typing import List from typing import List
from concurrent.futures import ProcessPoolExecutor from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np import numpy as np
import requests import requests
@@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
def download(): def download():
"""Downloads the dataset to disk.""" """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
os.makedirs(DATA_CACHE_DIR, exist_ok=True) os.makedirs(DATA_CACHE_DIR, exist_ok=True)
# download the TinyStories dataset, unless it's already downloaded # download the TinyStories dataset, unless it's already downloaded
@@ -66,10 +67,63 @@ def download():
print(f"Number of shards: {len(shard_filenames)}") print(f"Number of shards: {len(shard_filenames)}")
print(f"Example story:\n{data[0]}") print(f"Example story:\n{data[0]}")
def train_vocab(vocab_size):
    """
    Trains a custom sentencepiece tokenizer on the TinyStories dataset.
    The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories,
    where N is the vocab size. This is also where the pretok .bin files will go.

    The function is interactive: it asks for confirmation before launching
    train_vocab.sh and again before deleting the temporary text file.

    Args:
        vocab_size: number of tokens in the vocabulary to train, must be > 0.

    Raises:
        subprocess.CalledProcessError: if train_vocab.sh exits with a non-zero
            status, so a failed training run is never reported as a success.
    """
    # local import: only this function needs it
    import subprocess

    assert vocab_size > 0, "Vocab size must be positive"

    # output file prefix path for sentencepiece
    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")

    # how many shards we'll use for vocab training, kept low for efficiency
    num_shards = 10

    # 1) export a large chunk of text as a single text file tiny.txt
    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))

    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
    # explicit utf-8 on both sides: the shards are JSON and the trainer reads
    # utf-8, so don't depend on the platform's default encoding
    with open(tiny_file, "w", encoding="utf-8") as of:
        for shard in tqdm(shard_filenames[:num_shards]):
            with open(shard, "r", encoding="utf-8") as f:
                data = json.load(f)
            # one story per line
            for example in data:
                of.write(example["story"].strip() + "\n")
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

    # 2) run the train_vocab.sh script that trains the sentencepiece model
    print("Will now train the vocab with:")
    # keep the command as an argument list so that paths containing spaces or
    # shell metacharacters reach the script intact; the joined form is only
    # for display
    cmd_args = ["bash", "train_vocab.sh", tiny_file, prefix, str(vocab_size)]
    print(" ".join(cmd_args))
    print("OK? [y/N] ")
    dec = input()
    if dec.strip().lower() != "y":
        print("Exiting...")
        return
    # check=True: stop here if training failed instead of claiming success below
    subprocess.run(cmd_args, check=True)

    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
    if dec.strip().lower() == "y":
        os.remove(tiny_file)
        print(f"Deleted {tiny_file}")

    print(f"Trained tokenizer is in {prefix}.model")
    print("Done.")
def process_shard(args, vocab_size):
shard_id, shard = args shard_id, shard = args
enc = Tokenizer() tokenizer_model = None
if vocab_size > 0:
tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
enc = Tokenizer(tokenizer_model)
with open(shard, "r") as f: with open(shard, "r") as f:
data = json.load(f) data = json.load(f)
all_tokens = [] all_tokens = []
@@ -80,21 +134,37 @@ def process_shard(args):
all_tokens.extend(tokens) all_tokens.extend(tokens)
# convert to uint16 nparray # convert to uint16 nparray
all_tokens = np.array(all_tokens, dtype=np.uint16) all_tokens = np.array(all_tokens, dtype=np.uint16)
# write to disk # calculate the output filename
tokenized_filename = shard.replace(".json", ".bin") if vocab_size == 0:
# if we're using Llama 2, just save the tokenized file in the same dir
tokenized_filename = shard.replace(".json", ".bin")
else:
# save .bin files into a new tok{N} directory
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
shard_basename = os.path.basename(shard)
bin_basename = shard_basename.replace(".json", ".bin")
tokenized_filename = os.path.join(bin_dir, bin_basename)
# write the bytes
with open(tokenized_filename, "wb") as f: with open(tokenized_filename, "wb") as f:
f.write(all_tokens.tobytes()) f.write(all_tokens.tobytes())
print(f"Saved {tokenized_filename}") # calculate the average sequence length (they are separated by BOS=1)
avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
def pretokenize(vocab_size):
    """
    Tokenizes every TinyStories shard in parallel, one worker task per shard.

    Args:
        vocab_size: 0 selects the default tokenizer path in process_shard;
            a positive value N makes sure DATA_CACHE_DIR/tok{N} exists
            before the workers start, since that is where they write to.

    Raises:
        Whatever exception a worker raised while processing a shard; it is
        re-raised here instead of being dropped.
    """
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if vocab_size > 0:
        # .bin files will be saved into tok{N} directory, create it once here
        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
        os.makedirs(bin_dir, exist_ok=True)

    # process all the shards in a process pool
    fun = partial(process_shard, vocab_size=vocab_size)
    with ProcessPoolExecutor() as executor:
        # Executor.map only re-raises a worker's exception when its result is
        # retrieved from the returned iterator, so drain it; otherwise a
        # failed shard would go unnoticed and "Done." would still be printed.
        list(executor.map(fun, enumerate(shard_filenames)))
    print("Done.")
@@ -155,14 +225,29 @@ class Task:
if __name__ == "__main__":
    """
    These stages are designed to be run in order.

    To tokenize data with the Llama 2 tokenizer:
    python tinystories.py download
    python tinystories.py pretokenize

    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
    python tinystories.py download
    python tinystories.py train_vocab --vocab_size=2048
    python tinystories.py pretokenize --vocab_size=2048
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
    args = parser.parse_args()

    # validate up front so a bad value yields a usage error rather than a
    # traceback (or a write into a tok{N} directory that was never created)
    if args.vocab_size < 0:
        parser.error("--vocab_size must be >= 0")
    if args.stage == "train_vocab" and args.vocab_size == 0:
        parser.error("train_vocab requires a positive --vocab_size, e.g. --vocab_size=2048")

    # depending on the stage call the appropriate function
    if args.stage == "download":
        download()
    elif args.stage == "train_vocab":
        train_vocab(vocab_size=args.vocab_size)
    elif args.stage == "pretokenize":
        pretokenize(vocab_size=args.vocab_size)
    else:
        # defensive only: argparse's `choices` already rejects other values
        raise ValueError(f"Unknown stage {args.stage}")
+6 -5
View File
@@ -10,14 +10,13 @@ from typing import List
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
class Tokenizer: class Tokenizer:
def __init__(self): def __init__(self, tokenizer_model=None):
model_path = TOKENIZER_MODEL model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
assert os.path.isfile(model_path), model_path assert os.path.isfile(model_path), model_path
self.sp_model = SentencePieceProcessor(model_file=model_path) self.sp_model = SentencePieceProcessor(model_file=model_path)
#print(f"Loaded SentencePiece model from {model_path}") self.model_path = model_path
# BOS / EOS token IDs # BOS / EOS token IDs
self.n_words: int = self.sp_model.vocab_size() self.n_words: int = self.sp_model.vocab_size()
@@ -64,7 +63,9 @@ class Tokenizer:
max_token_length = max(len(t) for t in tokens) max_token_length = max(len(t) for t in tokens)
# write to a binary file # write to a binary file
with open(TOKENIZER_BIN, 'wb') as f: # the tokenizer.bin file is the same as .model file, but .bin
tokenizer_bin = self.model_path.replace('.model', '.bin')
with open(tokenizer_bin, 'wb') as f:
f.write(struct.pack("I", max_token_length)) f.write(struct.pack("I", max_token_length))
for bytes, score in zip(tokens, scores): for bytes, score in zip(tokens, scores):
f.write(struct.pack("fI", score, len(bytes))) f.write(struct.pack("fI", score, len(bytes)))
Executable
+126
View File
@@ -0,0 +1,126 @@
#!/bin/bash
# Trains a sentencepiece tokenizer model on a bunch of given data, my best
# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
# example:
# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
# requirements:
# install https://github.com/google/sentencepiece
# check if the correct number of arguments are provided
# (usage and error messages go to stderr so they never mix with normal output)
if [ $# -ne 3 ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>" >&2
    exit 1
fi

# assign command-line arguments to variables
input=$1
model_prefix=$2
vocab_size=$3

# check if input file exists
if [ ! -f "$input" ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>" >&2
    echo "input '$input' not found." >&2
    exit 1
fi

# check if vocab_size is a positive integer: digits only, at least one of
# them non-zero. A pure pattern match avoids the arithmetic comparison, which
# fails on digit strings too large for a shell integer.
if ! [[ "$vocab_size" =~ ^0*[1-9][0-9]*$ ]]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>" >&2
    echo "vocab_size must be a positive integer." >&2
    exit 1
fi
# Print the processed inputs
echo "Input: $input"
echo "Model Prefix: $model_prefix"
echo "Vocabulary Size: $vocab_size"
# train a sentencepiece tokenizer model
# Llama 2 config can be printed as follows:
# import sentencepiece.sentencepiece_model_pb2
# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
# mp.ParseFromString(open("tokenizer.model", "rb").read())
# print(mp.trainer_spec)
# print(mp.normalizer_spec)
# this gives:
# trainer_spec {
# input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
# model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
# model_type: BPE
# vocab_size: 32000
# self_test_sample_size: 0
# input_format: "text"
# character_coverage: 0.9999499917030334
# input_sentence_size: 200000000
# seed_sentencepiece_size: 1000000
# shrinking_factor: 0.75
# num_threads: 80
# num_sub_iterations: 2
# max_sentence_length: 4192
# shuffle_input_sentence: true
# max_sentencepiece_length: 16
# split_by_unicode_script: true
# split_by_whitespace: true
# split_by_number: true
# treat_whitespace_as_suffix: false
# split_digits: true
# allow_whitespace_only_pieces: true
# vocabulary_output_piece_score: true
# hard_vocab_limit: true
# use_all_vocab: false
# byte_fallback: true
# required_chars: ""
# unk_id: 0
# bos_id: 1
# eos_id: 2
# pad_id: -1
# unk_surface: " \342\201\207 "
# unk_piece: "<unk>"
# bos_piece: "<s>"
# eos_piece: "</s>"
# pad_piece: "<pad>"
# train_extremely_large_corpus: false
# enable_differential_privacy: false
# differential_privacy_noise_level: 0.0
# differential_privacy_clipping_threshold: 0
# }
# normalizer_spec {
# name: "identity"
# precompiled_charsmap: ""
# add_dummy_prefix: true
# remove_extra_whitespaces: false
# normalization_rule_tsv: ""
# }
# let's now use spm_train to train this exact model
# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
# we'll depart on a few settings:
# character_coverage -> 1.0
# other important notes:
# --split_digits is true, per the paper
# --allow_whitespace_only_pieces is true, default in spm is false
# --byte_fallback is true, default in spm is false
# --normalization_rule_name is identity, default in spm is nmt_nfkc
spm_train --input="$input" \
--model_prefix="$model_prefix" \
--model_type=bpe \
--vocab_size="$vocab_size" \
--self_test_sample_size=0 \
--input_format="text" \
--character_coverage=1.0 \
--num_threads="$(nproc)" \
--split_digits=true \
--allow_whitespace_only_pieces=true \
--byte_fallback=true \
--unk_surface=" \342\201\207 " \
--normalization_rule_name=identity \