Add the ability to train a custom sentencepiece tokenizer with a given vocab_size, and to pretokenize the dataset with it. Some more changes are still needed before this branch can be merged, in train.py and, of course, run.c. This was done in a somewhat ugly but fully backwards-compatible way: when a custom tokenizer is used, we create a whole new directory structure for it.
This commit is contained in:
+99
-14
@@ -9,6 +9,7 @@ import os
|
|||||||
import random
|
import random
|
||||||
from typing import List
|
from typing import List
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
@@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
|
|||||||
|
|
||||||
|
|
||||||
def download():
|
def download():
|
||||||
"""Downloads the dataset to disk."""
|
"""Downloads the TinyStories dataset to DATA_CACHE_DIR"""
|
||||||
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
|
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
|
||||||
|
|
||||||
# download the TinyStories dataset, unless it's already downloaded
|
# download the TinyStories dataset, unless it's already downloaded
|
||||||
@@ -66,10 +67,63 @@ def download():
|
|||||||
print(f"Number of shards: {len(shard_filenames)}")
|
print(f"Number of shards: {len(shard_filenames)}")
|
||||||
print(f"Example story:\n{data[0]}")
|
print(f"Example story:\n{data[0]}")
|
||||||
|
|
||||||
|
def train_vocab(vocab_size):
|
||||||
|
"""
|
||||||
|
Trains a custom sentencepiece tokenizer on the TinyStories dataset.
|
||||||
|
The custom tokenizer files will be saved under the prefix DATA_CACHE_DIR/tok{N}
|
||||||
|
(e.g. tok{N}.model), where N is the vocab size. The pretok .bin files go into the DATA_CACHE_DIR/tok{N} directory.
|
||||||
|
"""
|
||||||
|
assert vocab_size > 0, "Vocab size must be positive"
|
||||||
|
|
||||||
def process_shard(args):
|
# output file prefix path for sentencepiece
|
||||||
|
prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
|
||||||
|
|
||||||
|
# how many shards we'll use for vocab training, kept low for efficiency
|
||||||
|
num_shards = 10
|
||||||
|
|
||||||
|
# 1) export a large chunk of text as a single text file tiny.txt
|
||||||
|
tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
|
||||||
|
data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
|
||||||
|
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
|
||||||
|
|
||||||
|
print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
|
||||||
|
with open(tiny_file, "w") as of:
|
||||||
|
for shard in tqdm(shard_filenames[:num_shards]):
|
||||||
|
with open(shard, "r") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
for example in data:
|
||||||
|
text = example["story"]
|
||||||
|
text = text.strip()
|
||||||
|
of.write(text + "\n")
|
||||||
|
print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
|
||||||
|
|
||||||
|
# 2) run the train_vocab.sh script that trains the sentencepiece model
|
||||||
|
print("Will now train the vocab with:")
|
||||||
|
cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
|
||||||
|
print(cmd)
|
||||||
|
print("OK? [y/N] ")
|
||||||
|
dec = input()
|
||||||
|
if dec.lower() != "y":
|
||||||
|
print("Exiting...")
|
||||||
|
return
|
||||||
|
os.system(cmd)
|
||||||
|
|
||||||
|
# 3) optional cleanup, ask the user if they'd like to delete tiny.txt
|
||||||
|
dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
|
||||||
|
if dec.lower() == "y":
|
||||||
|
os.remove(tiny_file)
|
||||||
|
print(f"Deleted {tiny_file}")
|
||||||
|
|
||||||
|
print(f"Trained tokenizer is in {prefix}.model")
|
||||||
|
print("Done.")
|
||||||
|
|
||||||
|
|
||||||
|
def process_shard(args, vocab_size):
|
||||||
shard_id, shard = args
|
shard_id, shard = args
|
||||||
enc = Tokenizer()
|
tokenizer_model = None
|
||||||
|
if vocab_size > 0:
|
||||||
|
tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
|
||||||
|
enc = Tokenizer(tokenizer_model)
|
||||||
with open(shard, "r") as f:
|
with open(shard, "r") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
all_tokens = []
|
all_tokens = []
|
||||||
@@ -80,21 +134,37 @@ def process_shard(args):
|
|||||||
all_tokens.extend(tokens)
|
all_tokens.extend(tokens)
|
||||||
# convert to uint16 nparray
|
# convert to uint16 nparray
|
||||||
all_tokens = np.array(all_tokens, dtype=np.uint16)
|
all_tokens = np.array(all_tokens, dtype=np.uint16)
|
||||||
# write to disk
|
# calculate the output filename
|
||||||
|
if vocab_size == 0:
|
||||||
|
# if we're using Llama 2, just save the tokenized file in the same dir
|
||||||
tokenized_filename = shard.replace(".json", ".bin")
|
tokenized_filename = shard.replace(".json", ".bin")
|
||||||
|
else:
|
||||||
|
# save .bin files into a new tok{N} directory
|
||||||
|
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
|
||||||
|
shard_basename = os.path.basename(shard)
|
||||||
|
bin_basename = shard_basename.replace(".json", ".bin")
|
||||||
|
tokenized_filename = os.path.join(bin_dir, bin_basename)
|
||||||
|
# write the bytes
|
||||||
with open(tokenized_filename, "wb") as f:
|
with open(tokenized_filename, "wb") as f:
|
||||||
f.write(all_tokens.tobytes())
|
f.write(all_tokens.tobytes())
|
||||||
print(f"Saved {tokenized_filename}")
|
# calculate the average sequence length (they are separated by BOS=1)
|
||||||
|
avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
|
||||||
|
print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
|
||||||
|
|
||||||
|
|
||||||
def pretokenize():
|
def pretokenize(vocab_size):
|
||||||
# iterate the shards and tokenize all of them one by one
|
# iterate the shards and tokenize all of them one by one
|
||||||
data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
|
data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
|
||||||
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
|
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
|
||||||
|
if vocab_size > 0:
|
||||||
|
# .bin files will be saved into tok{N} directory, create it once here
|
||||||
|
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
|
||||||
|
os.makedirs(bin_dir, exist_ok=True)
|
||||||
|
|
||||||
# process all the shards in a process pool
|
# process all the shards in a process pool
|
||||||
|
fun = partial(process_shard, vocab_size=vocab_size)
|
||||||
with ProcessPoolExecutor() as executor:
|
with ProcessPoolExecutor() as executor:
|
||||||
executor.map(process_shard, enumerate(shard_filenames))
|
executor.map(fun, enumerate(shard_filenames))
|
||||||
print("Done.")
|
print("Done.")
|
||||||
|
|
||||||
|
|
||||||
@@ -155,14 +225,29 @@ class Task:
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
"""
|
||||||
|
These stages are designed to be run in order.
|
||||||
|
|
||||||
|
To tokenize data with the Llama 2 tokenizer:
|
||||||
|
python tinystories.py download
|
||||||
|
python tinystories.py pretokenize
|
||||||
|
|
||||||
|
To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
|
||||||
|
python tinystories.py download
|
||||||
|
python tinystories.py train_vocab --vocab_size=2048
|
||||||
|
python tinystories.py pretokenize --vocab_size=2048
|
||||||
|
"""
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"])
|
parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
|
||||||
|
parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# depending on the stage call the appropriate function
|
# depending on the stage call the appropriate function
|
||||||
fun = {
|
if args.stage == "download":
|
||||||
"download": download,
|
download()
|
||||||
"pretokenize": pretokenize,
|
elif args.stage == "train_vocab":
|
||||||
}
|
train_vocab(vocab_size=args.vocab_size)
|
||||||
fun[args.stage]()
|
elif args.stage == "pretokenize":
|
||||||
|
pretokenize(vocab_size=args.vocab_size)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown stage {args.stage}")
|
||||||
|
|||||||
+6
-5
@@ -10,14 +10,13 @@ from typing import List
|
|||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
|
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
|
||||||
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
|
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
def __init__(self):
|
def __init__(self, tokenizer_model=None):
|
||||||
model_path = TOKENIZER_MODEL
|
model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
|
||||||
assert os.path.isfile(model_path), model_path
|
assert os.path.isfile(model_path), model_path
|
||||||
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
self.sp_model = SentencePieceProcessor(model_file=model_path)
|
||||||
#print(f"Loaded SentencePiece model from {model_path}")
|
self.model_path = model_path
|
||||||
|
|
||||||
# BOS / EOS token IDs
|
# BOS / EOS token IDs
|
||||||
self.n_words: int = self.sp_model.vocab_size()
|
self.n_words: int = self.sp_model.vocab_size()
|
||||||
@@ -64,7 +63,9 @@ class Tokenizer:
|
|||||||
max_token_length = max(len(t) for t in tokens)
|
max_token_length = max(len(t) for t in tokens)
|
||||||
|
|
||||||
# write to a binary file
|
# write to a binary file
|
||||||
with open(TOKENIZER_BIN, 'wb') as f:
|
# the tokenizer .bin file has the same path as the .model file, but with a .bin extension
|
||||||
|
tokenizer_bin = self.model_path.replace('.model', '.bin')
|
||||||
|
with open(tokenizer_bin, 'wb') as f:
|
||||||
f.write(struct.pack("I", max_token_length))
|
f.write(struct.pack("I", max_token_length))
|
||||||
for bytes, score in zip(tokens, scores):
|
for bytes, score in zip(tokens, scores):
|
||||||
f.write(struct.pack("fI", score, len(bytes)))
|
f.write(struct.pack("fI", score, len(bytes)))
|
||||||
|
|||||||
Executable
+126
@@ -0,0 +1,126 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Trains a sentencepiece tokenizer model on a bunch of given data, my best
|
||||||
|
# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
|
||||||
|
|
||||||
|
# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
|
||||||
|
# example:
|
||||||
|
# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
|
||||||
|
# requirements:
|
||||||
|
# install https://github.com/google/sentencepiece
|
||||||
|
|
||||||
|
# check that the correct number of arguments is provided
|
||||||
|
if [ $# -ne 3 ]; then
|
||||||
|
echo "Usage: $0 <input> <model_prefix> <vocab_size>"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# assign command-line arguments to variables
|
||||||
|
input=$1
|
||||||
|
model_prefix=$2
|
||||||
|
vocab_size=$3
|
||||||
|
|
||||||
|
# check if input file exists
|
||||||
|
if [ ! -f "$input" ]; then
|
||||||
|
echo "Usage: $0 <input> <model_prefix> <vocab_size>"
|
||||||
|
echo "input '$input' not found."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# check if vocab_size is a positive integer
|
||||||
|
if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
|
||||||
|
echo "Usage: $0 <input> <model_prefix> <vocab_size>"
|
||||||
|
echo "vocab_size size must be a positive integer."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Print the processed inputs
|
||||||
|
echo "Input: $input"
|
||||||
|
echo "Model Prefix: $model_prefix"
|
||||||
|
echo "Vocabulary Size: $vocab_size"
|
||||||
|
|
||||||
|
# train a sentencepiece tokenizer model
|
||||||
|
# Llama 2 config can be printed as follows:
|
||||||
|
|
||||||
|
# import sentencepiece.sentencepiece_model_pb2
|
||||||
|
# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
|
||||||
|
# mp.ParseFromString(open("tokenizer.model", "rb").read())
|
||||||
|
# print(mp.trainer_spec)
|
||||||
|
# print(mp.normalizer_spec)
|
||||||
|
|
||||||
|
# this gives:
|
||||||
|
|
||||||
|
# trainer_spec {
|
||||||
|
# input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
|
||||||
|
# model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
|
||||||
|
# model_type: BPE
|
||||||
|
# vocab_size: 32000
|
||||||
|
# self_test_sample_size: 0
|
||||||
|
# input_format: "text"
|
||||||
|
# character_coverage: 0.9999499917030334
|
||||||
|
# input_sentence_size: 200000000
|
||||||
|
# seed_sentencepiece_size: 1000000
|
||||||
|
# shrinking_factor: 0.75
|
||||||
|
# num_threads: 80
|
||||||
|
# num_sub_iterations: 2
|
||||||
|
# max_sentence_length: 4192
|
||||||
|
# shuffle_input_sentence: true
|
||||||
|
# max_sentencepiece_length: 16
|
||||||
|
# split_by_unicode_script: true
|
||||||
|
# split_by_whitespace: true
|
||||||
|
# split_by_number: true
|
||||||
|
# treat_whitespace_as_suffix: false
|
||||||
|
# split_digits: true
|
||||||
|
# allow_whitespace_only_pieces: true
|
||||||
|
# vocabulary_output_piece_score: true
|
||||||
|
# hard_vocab_limit: true
|
||||||
|
# use_all_vocab: false
|
||||||
|
# byte_fallback: true
|
||||||
|
# required_chars: ""
|
||||||
|
# unk_id: 0
|
||||||
|
# bos_id: 1
|
||||||
|
# eos_id: 2
|
||||||
|
# pad_id: -1
|
||||||
|
# unk_surface: " \342\201\207 "
|
||||||
|
# unk_piece: "<unk>"
|
||||||
|
# bos_piece: "<s>"
|
||||||
|
# eos_piece: "</s>"
|
||||||
|
# pad_piece: "<pad>"
|
||||||
|
# train_extremely_large_corpus: false
|
||||||
|
# enable_differential_privacy: false
|
||||||
|
# differential_privacy_noise_level: 0.0
|
||||||
|
# differential_privacy_clipping_threshold: 0
|
||||||
|
# }
|
||||||
|
# normalizer_spec {
|
||||||
|
# name: "identity"
|
||||||
|
# precompiled_charsmap: ""
|
||||||
|
# add_dummy_prefix: true
|
||||||
|
# remove_extra_whitespaces: false
|
||||||
|
# normalization_rule_tsv: ""
|
||||||
|
# }
|
||||||
|
|
||||||
|
# let's now use spm_train to train this exact model
|
||||||
|
# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
|
||||||
|
|
||||||
|
# we'll depart on a few settings:
|
||||||
|
# character_coverage -> 1.0
|
||||||
|
|
||||||
|
# other important notes:
|
||||||
|
# --split_digits is true, per the paper
|
||||||
|
# --allow_whitespace_only_pieces is true, default in spm is false
|
||||||
|
# --byte_fallback is true, default in spm is false
|
||||||
|
# --normalization_rule_name is identity, default in spm is nmt_nfkc
|
||||||
|
|
||||||
|
spm_train --input="$input" \
|
||||||
|
--model_prefix="$model_prefix" \
|
||||||
|
--model_type=bpe \
|
||||||
|
--vocab_size="$vocab_size" \
|
||||||
|
--self_test_sample_size=0 \
|
||||||
|
--input_format="text" \
|
||||||
|
--character_coverage=1.0 \
|
||||||
|
--num_threads="$(nproc)" \
|
||||||
|
--split_digits=true \
|
||||||
|
--allow_whitespace_only_pieces=true \
|
||||||
|
--byte_fallback=true \
|
||||||
|
--unk_surface=" \342\201\207 " \
|
||||||
|
--normalization_rule_name=identity \
|
||||||
Reference in New Issue
Block a user