Add the ability to train a custom sentencepiece tokenizer with a given vocab_size, and to pretokenize with it. Some more changes are still needed to merge this branch, in train.py and of course run.c. Did this in a sadly somewhat ugly, but fully backwards-compatible way: basically, when we use a custom tokenizer we create a whole new directory structure for it.
This commit is contained in:
+100
-15
@@ -9,6 +9,7 @@ import os
|
||||
import random
|
||||
from typing import List
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
@@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
|
||||
|
||||
|
||||
def download():
|
||||
"""Downloads the dataset to disk."""
|
||||
"""Downloads the TinyStories dataset to DATA_CACHE_DIR"""
|
||||
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
|
||||
|
||||
# download the TinyStories dataset, unless it's already downloaded
|
||||
@@ -66,10 +67,63 @@ def download():
|
||||
print(f"Number of shards: {len(shard_filenames)}")
|
||||
print(f"Example story:\n{data[0]}")
|
||||
|
||||
def train_vocab(vocab_size):
    """
    Trains a custom sentencepiece tokenizer on the TinyStories dataset.

    The text of the first few shards is exported to the temporary file
    DATA_CACHE_DIR/tiny.txt and train_vocab.sh is run on it with the output
    prefix DATA_CACHE_DIR/tok{N}, where N is the vocab size. The trained
    model therefore ends up at DATA_CACHE_DIR/tok{N}.model. The pretok .bin
    files later go into the DATA_CACHE_DIR/tok{N} directory.

    This stage is interactive: it asks for confirmation before training and
    asks whether to delete the temporary text file afterwards.

    Args:
        vocab_size: number of tokens in the vocabulary, must be positive.
    """
    # local import: this is the only stage that launches an external program
    import subprocess

    assert vocab_size > 0, "Vocab size must be positive"

    # output file prefix path for sentencepiece
    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")

    # how many shards we'll use for vocab training, kept low for efficiency
    num_shards = 10

    # 1) export a large chunk of text as a single text file tiny.txt
    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if not shard_filenames:
        # nothing to train on; don't write an empty file and train on it
        print(f"No shards found in {data_dir}, run the download stage first.")
        return

    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
    # explicit utf-8 so the export does not depend on the platform default
    with open(tiny_file, "w", encoding="utf-8") as of:
        for shard in tqdm(shard_filenames[:num_shards]):
            with open(shard, "r", encoding="utf-8") as f:
                data = json.load(f)
            for example in data:
                text = example["story"]
                text = text.strip()
                of.write(text + "\n")
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

    # 2) run the train_vocab.sh script that trains the sentencepiece model
    print("Will now train the vocab with:")
    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
    print(cmd)
    print("OK? [y/N] ")
    dec = input()
    if dec.lower() != "y":
        print("Exiting...")
        return
    # pass the arguments as a list (no shell), so that paths containing
    # spaces or shell metacharacters reach the script unchanged
    result = subprocess.run(
        ["bash", "train_vocab.sh", tiny_file, prefix, str(vocab_size)]
    )

    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
    if dec.lower() == "y":
        os.remove(tiny_file)
        print(f"Deleted {tiny_file}")

    # only claim success if the training script actually succeeded
    if result.returncode != 0:
        print(f"Tokenizer training failed with exit code {result.returncode}")
        return

    print(f"Trained tokenizer is in {prefix}.model")
    print("Done.")
|
||||
|
||||
|
||||
def process_shard(args, vocab_size):
|
||||
shard_id, shard = args
|
||||
enc = Tokenizer()
|
||||
tokenizer_model = None
|
||||
if vocab_size > 0:
|
||||
tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
|
||||
enc = Tokenizer(tokenizer_model)
|
||||
with open(shard, "r") as f:
|
||||
data = json.load(f)
|
||||
all_tokens = []
|
||||
@@ -80,21 +134,37 @@ def process_shard(args):
|
||||
all_tokens.extend(tokens)
|
||||
# convert to uint16 nparray
|
||||
all_tokens = np.array(all_tokens, dtype=np.uint16)
|
||||
# write to disk
|
||||
tokenized_filename = shard.replace(".json", ".bin")
|
||||
# calculate the output filename
|
||||
if vocab_size == 0:
|
||||
# if we're using Llama 2, just save the tokenized file in the same dir
|
||||
tokenized_filename = shard.replace(".json", ".bin")
|
||||
else:
|
||||
# save .bin files into a new tok{N} directory
|
||||
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
|
||||
shard_basename = os.path.basename(shard)
|
||||
bin_basename = shard_basename.replace(".json", ".bin")
|
||||
tokenized_filename = os.path.join(bin_dir, bin_basename)
|
||||
# write the bytes
|
||||
with open(tokenized_filename, "wb") as f:
|
||||
f.write(all_tokens.tobytes())
|
||||
print(f"Saved {tokenized_filename}")
|
||||
# calculate the average sequence length (they are separated by BOS=1)
|
||||
avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
|
||||
print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
|
||||
|
||||
|
||||
def pretokenize(vocab_size):
    """
    Tokenizes every TinyStories shard in a pool of worker processes.

    Args:
        vocab_size: vocab size of the custom tokenizer to use. If positive,
            the DATA_CACHE_DIR/tok{N} output directory for the .bin files is
            created here once, before the workers start. The value is passed
            on unchanged to process_shard for every shard.
    """
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if vocab_size > 0:
        # .bin files will be saved into tok{N} directory, create it once here
        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
        os.makedirs(bin_dir, exist_ok=True)

    # process all the shards in a process pool
    fun = partial(process_shard, vocab_size=vocab_size)
    with ProcessPoolExecutor() as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved, so drain the iterator; otherwise a failed shard would be
        # skipped silently and "Done." printed anyway
        list(executor.map(fun, enumerate(shard_filenames)))
    print("Done.")
|
||||
|
||||
|
||||
@@ -155,14 +225,29 @@ class Task:
|
||||
|
||||
|
||||
if __name__ == "__main__":
    """
    These stages are designed to be run in order.

    To tokenize data with the Llama 2 tokenizer:
    python tinystories.py download
    python tinystories.py pretokenize

    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
    python tinystories.py download
    python tinystories.py train_vocab --vocab_size=2048
    python tinystories.py pretokenize --vocab_size=2048
    """
    parser = argparse.ArgumentParser()
    # a single positional "stage" argument selects which step to run
    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
    args = parser.parse_args()

    # depending on the stage call the appropriate function; the stages take
    # different arguments, so they are dispatched explicitly
    if args.stage == "download":
        download()
    elif args.stage == "train_vocab":
        train_vocab(vocab_size=args.vocab_size)
    elif args.stage == "pretokenize":
        pretokenize(vocab_size=args.vocab_size)
    else:
        # guards against a stage being added to `choices` without a branch here
        raise ValueError(f"Unknown stage {args.stage}")
|
||||
|
||||
+7
-6
@@ -10,14 +10,13 @@ from typing import List
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
|
||||
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
|
||||
|
||||
class Tokenizer:
|
||||
def __init__(self):
|
||||
model_path = TOKENIZER_MODEL
|
||||
def __init__(self, tokenizer_model=None):
|
||||
model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
|
||||
assert os.path.isfile(model_path), model_path
|
||||
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
||||
#print(f"Loaded SentencePiece model from {model_path}")
|
||||
self.model_path = model_path
|
||||
|
||||
# BOS / EOS token IDs
|
||||
self.n_words: int = self.sp_model.vocab_size()
|
||||
@@ -59,12 +58,14 @@ class Tokenizer:
|
||||
|
||||
tokens.append(b)
|
||||
scores.append(s)
|
||||
|
||||
|
||||
# record the max token length
|
||||
max_token_length = max(len(t) for t in tokens)
|
||||
|
||||
# write to a binary file
|
||||
with open(TOKENIZER_BIN, 'wb') as f:
|
||||
# the tokenizer.bin file is the same as .model file, but .bin
|
||||
tokenizer_bin = self.model_path.replace('.model', '.bin')
|
||||
with open(tokenizer_bin, 'wb') as f:
|
||||
f.write(struct.pack("I", max_token_length))
|
||||
for bytes, score in zip(tokens, scores):
|
||||
f.write(struct.pack("fI", score, len(bytes)))
|
||||
|
||||
Executable
+126
@@ -0,0 +1,126 @@
|
||||
#!/bin/bash

# Trains a sentencepiece tokenizer model on a bunch of given data, my best
# effort attempt to replicate how Meta trained their Llama 2 tokenizer.

# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
# example:
# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
# requirements:
# install https://github.com/google/sentencepiece

# check if the correct number of arguments are provided
if [ $# -ne 3 ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
    exit 1
fi

# assign command-line arguments to variables
input=$1
model_prefix=$2
vocab_size=$3

# check if input file exists
if [ ! -f "$input" ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
    echo "input '$input' not found."
    exit 1
fi

# check if vocab_size is a positive integer
if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
    echo "vocab_size must be a positive integer."
    exit 1
fi

# check that the sentencepiece trainer is installed, so that a missing
# dependency gives a clear message instead of "command not found"
if ! command -v spm_train > /dev/null 2>&1; then
    echo "spm_train not found, install https://github.com/google/sentencepiece"
    exit 1
fi

# number of training threads: nproc is not available everywhere, so fall
# back to getconf and finally to a single thread
num_threads=$(nproc 2> /dev/null || getconf _NPROCESSORS_ONLN 2> /dev/null || echo 1)

# Print the processed inputs
echo "Input: $input"
echo "Model Prefix: $model_prefix"
echo "Vocabulary Size: $vocab_size"

# train a sentencepiece tokenizer model
# Llama 2 config can be printed as follows:

# import sentencepiece.sentencepiece_model_pb2
# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
# mp.ParseFromString(open("tokenizer.model", "rb").read())
# print(mp.trainer_spec)
# print(mp.normalizer_spec)

# this gives:

# trainer_spec {
#   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
#   model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
#   model_type: BPE
#   vocab_size: 32000
#   self_test_sample_size: 0
#   input_format: "text"
#   character_coverage: 0.9999499917030334
#   input_sentence_size: 200000000
#   seed_sentencepiece_size: 1000000
#   shrinking_factor: 0.75
#   num_threads: 80
#   num_sub_iterations: 2
#   max_sentence_length: 4192
#   shuffle_input_sentence: true
#   max_sentencepiece_length: 16
#   split_by_unicode_script: true
#   split_by_whitespace: true
#   split_by_number: true
#   treat_whitespace_as_suffix: false
#   split_digits: true
#   allow_whitespace_only_pieces: true
#   vocabulary_output_piece_score: true
#   hard_vocab_limit: true
#   use_all_vocab: false
#   byte_fallback: true
#   required_chars: ""
#   unk_id: 0
#   bos_id: 1
#   eos_id: 2
#   pad_id: -1
#   unk_surface: " \342\201\207 "
#   unk_piece: "<unk>"
#   bos_piece: "<s>"
#   eos_piece: "</s>"
#   pad_piece: "<pad>"
#   train_extremely_large_corpus: false
#   enable_differential_privacy: false
#   differential_privacy_noise_level: 0.0
#   differential_privacy_clipping_threshold: 0
# }
# normalizer_spec {
#   name: "identity"
#   precompiled_charsmap: ""
#   add_dummy_prefix: true
#   remove_extra_whitespaces: false
#   normalization_rule_tsv: ""
# }

# let's now use spm_train to train this exact model
# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md

# we'll depart on a few settings:
# character_coverage -> 1.0

# other important notes:
# --split_digits = true, per the paper
# --allow_whitespace_only_pieces is true, default in spm is false
# --byte_fallback is true, default in spm is false
# --normalization_rule_name is identity, default in spm is nmt_nfkc

# NOTE: the octal escapes in the config dump above denote three raw bytes.
# Inside plain double quotes bash would pass the backslashes literally, so
# ANSI-C quoting ($'...') is used to pass the actual bytes to spm_train.
spm_train --input="$input" \
          --model_prefix="$model_prefix" \
          --model_type=bpe \
          --vocab_size="$vocab_size" \
          --self_test_sample_size=0 \
          --input_format="text" \
          --character_coverage=1.0 \
          --num_threads="$num_threads" \
          --split_digits=true \
          --allow_whitespace_only_pieces=true \
          --byte_fallback=true \
          --unk_surface=$' \342\201\207 ' \
          --normalization_rule_name=identity
|
||||
Reference in New Issue
Block a user