diff --git a/tinystories.py b/tinystories.py
index 419e0d5..d41f8fc 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -9,6 +9,7 @@ import os
import random
from typing import List
from concurrent.futures import ProcessPoolExecutor
+from functools import partial
import numpy as np
import requests
@@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
def download():
- """Downloads the dataset to disk."""
+ """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
# download the TinyStories dataset, unless it's already downloaded
@@ -66,10 +67,63 @@ def download():
print(f"Number of shards: {len(shard_filenames)}")
print(f"Example story:\n{data[0]}")
+def train_vocab(vocab_size):
+    """
+    Trains a custom sentencepiece tokenizer on the TinyStories dataset.
+
+    The trained model is written to DATA_CACHE_DIR/tok{N}.model, where N is the
+    vocab size. DATA_CACHE_DIR/tok{N} is also the directory where the pretok
+    .bin files will go.
+
+    The function is interactive: it asks for confirmation before running
+    train_vocab.sh and again before deleting the temporary tiny.txt file.
+
+    Args:
+        vocab_size: number of tokens in the new vocabulary, must be > 0.
+    """
+    assert vocab_size > 0, "Vocab size must be positive"
-def process_shard(args):
+    # output file prefix path for sentencepiece
+    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
+
+    # how many shards we'll use for vocab training, kept low for efficiency
+    num_shards = 10
+
+    # 1) export a large chunk of text as a single text file tiny.txt
+    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
+    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
+    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
+
+    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
+    # explicit utf-8 so reading/writing does not depend on the platform locale
+    with open(tiny_file, "w", encoding="utf-8") as of:
+        for shard in tqdm(shard_filenames[:num_shards]):
+            with open(shard, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            for example in data:
+                text = example["story"]
+                text = text.strip()
+                of.write(text + "\n")
+    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
+
+    # 2) run the train_vocab.sh script that trains the sentencepiece model
+    print("Will now train the vocab with:")
+    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
+    print(cmd)
+    print("OK? [y/N] ")
+    dec = input()
+    if dec.lower() != "y":
+        print("Exiting...")
+        return
+    status = os.system(cmd)
+    if status != 0:
+        # training failed: don't report success or offer to delete the input
+        print(f"Vocab training failed (status {status}), keeping {tiny_file}")
+        return
+
+    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
+    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
+    if dec.lower() == "y":
+        os.remove(tiny_file)
+        print(f"Deleted {tiny_file}")
+
+    print(f"Trained tokenizer is in {prefix}.model")
+    print("Done.")
+
+
+def process_shard(args, vocab_size):
shard_id, shard = args
- enc = Tokenizer()
+ tokenizer_model = None
+ if vocab_size > 0:
+ tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
+ enc = Tokenizer(tokenizer_model)
with open(shard, "r") as f:
data = json.load(f)
all_tokens = []
@@ -80,21 +134,37 @@ def process_shard(args):
all_tokens.extend(tokens)
# convert to uint16 nparray
all_tokens = np.array(all_tokens, dtype=np.uint16)
- # write to disk
- tokenized_filename = shard.replace(".json", ".bin")
+ # calculate the output filename
+ if vocab_size == 0:
+ # if we're using Llama 2, just save the tokenized file in the same dir
+ tokenized_filename = shard.replace(".json", ".bin")
+ else:
+ # save .bin files into a new tok{N} directory
+ bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
+ shard_basename = os.path.basename(shard)
+ bin_basename = shard_basename.replace(".json", ".bin")
+ tokenized_filename = os.path.join(bin_dir, bin_basename)
+ # write the bytes
with open(tokenized_filename, "wb") as f:
f.write(all_tokens.tobytes())
- print(f"Saved {tokenized_filename}")
+ # calculate the average sequence length (they are separated by BOS=1)
+ avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
+ print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
-def pretokenize():
+def pretokenize(vocab_size):
+    """
+    Tokenizes every TinyStories shard in a process pool.
+
+    Args:
+        vocab_size: 0 uses the default (Llama 2) tokenizer; a value N > 0 uses
+            the custom tokenizer and writes the .bin files into the
+            DATA_CACHE_DIR/tok{N} directory, which is created here.
+
+    Any exception raised while processing a shard is re-raised in the caller.
+    """
     # iterate the shards and tokenize all of them one by one
     data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
     shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
+    if vocab_size > 0:
+        # .bin files will be saved into tok{N} directory, create it once here
+        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
+        os.makedirs(bin_dir, exist_ok=True)
     # process all the shards in a process pool
+    fun = partial(process_shard, vocab_size=vocab_size)
     with ProcessPoolExecutor() as executor:
-        executor.map(process_shard, enumerate(shard_filenames))
+        # drain the iterator: executor.map only re-raises a worker's exception
+        # when its result is retrieved, otherwise failures pass silently
+        list(executor.map(fun, enumerate(shard_filenames)))
     print("Done.")
@@ -155,14 +225,29 @@ class Task:
if __name__ == "__main__":
+    """
+    These stages are designed to be run in order.
+
+    To tokenize data with the Llama 2 tokenizer:
+    python tinystories.py download
+    python tinystories.py pretokenize
+
+    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
+    python tinystories.py download
+    python tinystories.py train_vocab --vocab_size=2048
+    python tinystories.py pretokenize --vocab_size=2048
+    """
     parser = argparse.ArgumentParser()
-    parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"])
+    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
+    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
     args = parser.parse_args()
     # depending on the stage call the appropriate function
-    fun = {
-        "download": download,
-        "pretokenize": pretokenize,
-    }
-    fun[args.stage]()
-
+    # stage name -> zero-argument callable; only the vocab stages take vocab_size
+    stage_actions = {
+        "download": download,
+        "train_vocab": partial(train_vocab, vocab_size=args.vocab_size),
+        "pretokenize": partial(pretokenize, vocab_size=args.vocab_size),
+    }
+    if args.stage not in stage_actions:
+        raise ValueError(f"Unknown stage {args.stage}")
+    stage_actions[args.stage]()
diff --git a/tokenizer.py b/tokenizer.py
index 35eee20..981b2ac 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -10,14 +10,13 @@ from typing import List
from sentencepiece import SentencePieceProcessor
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
-TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
class Tokenizer:
- def __init__(self):
- model_path = TOKENIZER_MODEL
+ def __init__(self, tokenizer_model=None):
+ model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
assert os.path.isfile(model_path), model_path
self.sp_model = SentencePieceProcessor(model_file=model_path)
- #print(f"Loaded SentencePiece model from {model_path}")
+ self.model_path = model_path
# BOS / EOS token IDs
self.n_words: int = self.sp_model.vocab_size()
@@ -59,12 +58,14 @@ class Tokenizer:
tokens.append(b)
scores.append(s)
-
+
# record the max token length
max_token_length = max(len(t) for t in tokens)
# write to a binary file
- with open(TOKENIZER_BIN, 'wb') as f:
+ # the tokenizer.bin file is the same as .model file, but .bin
+ tokenizer_bin = self.model_path.replace('.model', '.bin')
+ with open(tokenizer_bin, 'wb') as f:
f.write(struct.pack("I", max_token_length))
for bytes, score in zip(tokens, scores):
f.write(struct.pack("fI", score, len(bytes)))
diff --git a/train_vocab.sh b/train_vocab.sh
new file mode 100755
index 0000000..7803af8
--- /dev/null
+++ b/train_vocab.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Trains a sentencepiece tokenizer model on a bunch of given data, my best
+# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
+
+# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
+# example:
+# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
+# requirements:
+# install https://github.com/google/sentencepiece
+
+# check if the correct number of arguments are provided
+if [ $# -ne 3 ]; then
+    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
+    exit 1
+fi
+
+# assign command-line arguments to variables
+input=$1
+model_prefix=$2
+vocab_size=$3
+
+# check if input file exists
+if [ ! -f "$input" ]; then
+    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
+    echo "input '$input' not found."
+    exit 1
+fi
+
+# check if vocab_size is a positive integer
+if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
+    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
+    echo "vocab_size must be a positive integer."
+    exit 1
+fi
+
+# Print the processed inputs
+echo "Input: $input"
+echo "Model Prefix: $model_prefix"
+echo "Vocabulary Size: $vocab_size"
+
+# train a sentencepiece tokenizer model
+# Llama 2 config can be printed as follows:
+
+# import sentencepiece.sentencepiece_model_pb2
+# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
+# mp.ParseFromString(open("tokenizer.model", "rb").read())
+# print(mp.trainer_spec)
+# print(mp.normalizer_spec)
+
+# this gives:
+
+# trainer_spec {
+# input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
+# model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
+# model_type: BPE
+# vocab_size: 32000
+# self_test_sample_size: 0
+# input_format: "text"
+# character_coverage: 0.9999499917030334
+# input_sentence_size: 200000000
+# seed_sentencepiece_size: 1000000
+# shrinking_factor: 0.75
+# num_threads: 80
+# num_sub_iterations: 2
+# max_sentence_length: 4192
+# shuffle_input_sentence: true
+# max_sentencepiece_length: 16
+# split_by_unicode_script: true
+# split_by_whitespace: true
+# split_by_number: true
+# treat_whitespace_as_suffix: false
+# split_digits: true
+# allow_whitespace_only_pieces: true
+# vocabulary_output_piece_score: true
+# hard_vocab_limit: true
+# use_all_vocab: false
+# byte_fallback: true
+# required_chars: ""
+# unk_id: 0
+# bos_id: 1
+# eos_id: 2
+# pad_id: -1
+# unk_surface: " \342\201\207 "
+# unk_piece: "<unk>"
+# bos_piece: "<s>"
+# eos_piece: "</s>"
+# pad_piece: "<pad>"
+# train_extremely_large_corpus: false
+# enable_differential_privacy: false
+# differential_privacy_noise_level: 0.0
+# differential_privacy_clipping_threshold: 0
+# }
+# normalizer_spec {
+# name: "identity"
+# precompiled_charsmap: ""
+# add_dummy_prefix: true
+# remove_extra_whitespaces: false
+# normalization_rule_tsv: ""
+# }
+
+# let's now use spm_train to train this exact model
+# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
+
+# we'll depart on a few settings:
+# character_coverage -> 1.0
+
+# other important notes:
+# --split-digits = true, per the paper
+# --allow_whitespace_only_pieces is true, default in spm is false
+# --byte_fallback is true, default in spm is false
+# --normalization_rule_name is identity, default in spm is nmt_nfkc
+
+spm_train --input="$input" \
+ --model_prefix="$model_prefix" \
+ --model_type=bpe \
+ --vocab_size="$vocab_size" \
+ --self_test_sample_size=0 \
+ --input_format="text" \
+ --character_coverage=1.0 \
+ --num_threads="$(nproc)" \
+ --split_digits=true \
+ --allow_whitespace_only_pieces=true \
+ --byte_fallback=true \
+ --unk_surface=" \342\201\207 " \
+ --normalization_rule_name=identity \