From c42641205ffe17871af3464f35f51b201e58ebeb Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Thu, 10 Aug 2023 15:23:05 +0000
Subject: [PATCH 01/30] turn off topp sampling by default because it is a bit
 too slow to be the default. it is likely that turning it on, e.g. -p 0.9 is
 mildly higher quality and safer samples, but this comes at a cost of too much
 performance in double digit percent sometimes, for it to be on by default i
 think...

---
 README.md | 4 +++-
 run.c     | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 0721c19..be6d8d9 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,9 @@ You can also prompt the model with a prefix or a number of additional command li
 
 > One day, Lily met a Shoggoth. He was very shy, but was also very generous. Lily said “Hello Shoggy! Can I be your friend?” Shoggy was happy to have a friend and said “Yes, let’s explore the universe together!” So they set off on a journey to explore the universe. As they travelled, Shoggy was happy to explain to Lily about all the wonderful things in the universe. At the end of the day, Lily and Shoggy had gathered lots of wonderful things from the universe, and they both felt very proud. They promised to explore the universe as one big pair and to never stop being generous to each other.
 
-There is also an even better 110M param model available, see [models](#models). Quick note on sampling, the recommendation for good results is to use `-t 1.0 -p 0.9`, i.e. top-p sampling at 0.9 with temperature 1.0 (this is the default). To control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
+There is also an even better 110M param model available, see [models](#models).
+
+Quick note on sampling, the recommendation for ~best results is to sample with `-t 1.0 -p 0.9`, i.e. temperature 1.0 (default) but also top-p sampling at 0.9 (not default!). The top-p sampling is turned off by default because it can run quite a bit slower. More generally, to control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
 
 ## Meta's Llama 2 models
 
diff --git a/run.c b/run.c
index 9f4a1b2..afe695f 100644
--- a/run.c
+++ b/run.c
@@ -504,7 +504,7 @@ void error_usage() {
     fprintf(stderr, "Example: run model.bin -n 256 -i \"Once upon a time\"\n");
     fprintf(stderr, "Options:\n");
     fprintf(stderr, "  -t <float>  temperature, default 1.0\n");
-    fprintf(stderr, "  -p <float>  p value in top-p (nucleus) sampling. default 0.9, 0 = off\n");
+    fprintf(stderr, "  -p <float>  p value in top-p (nucleus) sampling. default 1.0 (=off)\n");
     fprintf(stderr, "  -s <int>    random seed, default time(NULL)\n");
     fprintf(stderr, "  -n <int>    number of steps to run for, default 256. 0 = max_seq_len\n");
     fprintf(stderr, "  -i <string> input prompt\n");
@@ -516,7 +516,7 @@ int main(int argc, char *argv[]) {
     // default inits
     char *checkpoint = NULL;  // e.g. out/model.bin
     float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher
-    float topp = 0.9f;        // top-p in nucleus sampling
+    float topp = 1.0f;        // top-p in nucleus sampling. 1.0 = off. 0.9 works well, but slower
     rng_seed = 0; // seed rng with time by default
     int steps = 256;          // number of steps to run for
     char *prompt = NULL;      // prompt string
@@ -623,7 +623,7 @@ int main(int argc, char *argv[]) {
                 // apply softmax to the logits to get the probabilities for next token
                 softmax(state.logits, config.vocab_size);
                 // we sample from this distribution to get the next token
-                if (topp <= 0) {
+                if (topp <= 0 || topp >= 1) {
                     // simply sample from the predicted probability distribution
                     next = sample(state.logits, config.vocab_size);
                 } else {

From 4c6f0af9ff3671b0b8053c6a3a512a06bad5c676 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Fri, 11 Aug 2023 03:58:22 +0000
Subject: [PATCH 02/30] add the ability to train a custom sentencepiece
 tokenizer with a given vocab_size, and pretok with it. some more changes
 still needed to merge this branch, in train.py and ofc run.c. did this in a
 sadly bit ugly, but fully backwards compatible way. basically when we use
 custom tokenizer we create a whole new directory structure for that

---
 tinystories.py | 115 ++++++++++++++++++++++++++++++++++++++------
 tokenizer.py   |  13 ++---
 train_vocab.sh | 126 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 233 insertions(+), 21 deletions(-)
 create mode 100755 train_vocab.sh

diff --git a/tinystories.py b/tinystories.py
index 419e0d5..d41f8fc 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -9,6 +9,7 @@ import os
 import random
 from typing import List
 from concurrent.futures import ProcessPoolExecutor
+from functools import partial
 
 import numpy as np
 import requests
@@ -37,7 +38,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
 
 
 def download():
-    """Downloads the dataset to disk."""
+    """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
     os.makedirs(DATA_CACHE_DIR, exist_ok=True)
 
     # download the TinyStories dataset, unless it's already downloaded
@@ -66,10 +67,63 @@ def download():
     print(f"Number of shards: {len(shard_filenames)}")
     print(f"Example story:\n{data[0]}")
 
+def train_vocab(vocab_size):
+    """
+    Trains a custom sentencepiece tokenizer on the TinyStories dataset.
+    The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories,
+    where N is the vocab size. This is also where the pretok .bin files will go.
+    """
+    assert vocab_size > 0, "Vocab size must be positive"
 
-def process_shard(args):
+    # output file prefix path for sentencepiece
+    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
+
+    # how many shards we'll use for vocab training, kept low for efficiency
+    num_shards = 10
+
+    # 1) export a large chunk of text as a single text file tiny.txt
+    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
+    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
+    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
+
+    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
+    with open(tiny_file, "w") as of:
+        for shard in tqdm(shard_filenames[:num_shards]):
+            with open(shard, "r") as f:
+                data = json.load(f)
+            for example in data:
+                text = example["story"]
+                text = text.strip()
+                of.write(text + "\n")
+    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
+
+    # 2) run the train_vocab.sh script that trains the sentencepiece model
+    print("Will now train the vocab with:")
+    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
+    print(cmd)
+    print("OK? [y/N] ")
+    dec = input()
+    if dec.lower() != "y":
+        print("Exiting...")
+        return
+    os.system(cmd)
+
+    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
+    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
+    if dec.lower() == "y":
+        os.remove(tiny_file)
+        print(f"Deleted {tiny_file}")
+
+    print(f"Trained tokenizer is in {prefix}.model")
+    print("Done.")
+
+
+def process_shard(args, vocab_size):
     shard_id, shard = args
-    enc = Tokenizer()
+    tokenizer_model = None
+    if vocab_size > 0:
+        tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
+    enc = Tokenizer(tokenizer_model)
     with open(shard, "r") as f:
         data = json.load(f)
     all_tokens = []
@@ -80,21 +134,37 @@ def process_shard(args):
         all_tokens.extend(tokens)
     # convert to uint16 nparray
     all_tokens = np.array(all_tokens, dtype=np.uint16)
-    # write to disk
-    tokenized_filename = shard.replace(".json", ".bin")
+    # calculate the output filename
+    if vocab_size == 0:
+        # if we're using Llama 2, just save the tokenized file in the same dir
+        tokenized_filename = shard.replace(".json", ".bin")
+    else:
+        # save .bin files into a new tok{N} directory
+        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
+        shard_basename = os.path.basename(shard)
+        bin_basename = shard_basename.replace(".json", ".bin")
+        tokenized_filename = os.path.join(bin_dir, bin_basename)
+    # write the bytes
     with open(tokenized_filename, "wb") as f:
         f.write(all_tokens.tobytes())
-    print(f"Saved {tokenized_filename}")
+    # calculate the average sequence length (they are separated by BOS=1)
+    avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
+    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
 
 
-def pretokenize():
+def pretokenize(vocab_size):
     # iterate the shards and tokenize all of them one by one
     data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
     shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
+    if vocab_size > 0:
+        # .bin files will be saved into tok{N} directory, create it once here
+        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
+        os.makedirs(bin_dir, exist_ok=True)
 
     # process all the shards in a process pool
+    fun = partial(process_shard, vocab_size=vocab_size)
     with ProcessPoolExecutor() as executor:
-        executor.map(process_shard, enumerate(shard_filenames))
+        executor.map(fun, enumerate(shard_filenames))
     print("Done.")
 
 
@@ -155,14 +225,29 @@ class Task:
 
 
 if __name__ == "__main__":
+    """
+    These stages are designed to be run in order.
+
+    To tokenize data with the Llama 2 tokenizer:
+    python tinystories.py download
+    python tinystories.py pretokenize
+
+    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
+    python tinystories.py download
+    python tinystories.py train_vocab --vocab_size=2048
+    python tinystories.py pretokenize --vocab_size=2048
+    """
     parser = argparse.ArgumentParser()
-    parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"])
+    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
+    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
     args = parser.parse_args()
 
     # depending on the stage call the appropriate function
-    fun = {
-        "download": download,
-        "pretokenize": pretokenize,
-    }
-    fun[args.stage]()
-
+    if args.stage == "download":
+        download()
+    elif args.stage == "train_vocab":
+        train_vocab(vocab_size=args.vocab_size)
+    elif args.stage == "pretokenize":
+        pretokenize(vocab_size=args.vocab_size)
+    else:
+        raise ValueError(f"Unknown stage {args.stage}")
diff --git a/tokenizer.py b/tokenizer.py
index 35eee20..981b2ac 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -10,14 +10,13 @@ from typing import List
 from sentencepiece import SentencePieceProcessor
 
 TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
-TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
 
 class Tokenizer:
-    def __init__(self):
-        model_path = TOKENIZER_MODEL
+    def __init__(self, tokenizer_model=None):
+        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
         assert os.path.isfile(model_path), model_path
         self.sp_model = SentencePieceProcessor(model_file=model_path)
-        #print(f"Loaded SentencePiece model from {model_path}")
+        self.model_path = model_path
 
         # BOS / EOS token IDs
         self.n_words: int = self.sp_model.vocab_size()
@@ -59,12 +58,14 @@ class Tokenizer:
 
             tokens.append(b)
             scores.append(s)
-        
+
         # record the max token length
         max_token_length = max(len(t) for t in tokens)
 
         # write to a binary file
-        with open(TOKENIZER_BIN, 'wb') as f:
+        # the tokenizer.bin file is the same as .model file, but .bin
+        tokenizer_bin = self.model_path.replace('.model', '.bin')
+        with open(tokenizer_bin, 'wb') as f:
             f.write(struct.pack("I", max_token_length))
             for bytes, score in zip(tokens, scores):
                 f.write(struct.pack("fI", score, len(bytes)))
diff --git a/train_vocab.sh b/train_vocab.sh
new file mode 100755
index 0000000..7803af8
--- /dev/null
+++ b/train_vocab.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Trains a sentencepiece tokenizer model on a bunch of given data, my best
+# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
+
+# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
+# example:
+# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
+# requirements:
+# install https://github.com/google/sentencepiece
+
+# check if the correct number of arguments are provided
+if [ $# -ne 3 ]; then
+    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
+    exit 1
+fi
+
+# assign command-line arguments to variables
+input=$1
+model_prefix=$2
+vocab_size=$3
+
+# check if input file exists
+if [ ! -f "$input" ]; then
+    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
+    echo "input '$input' not found."
+    exit 1
+fi
+
+# check if vocab_size is a positive integer
+if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
+    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
+    echo "vocab_size size must be a positive integer."
+    exit 1
+fi
+
+# Print the processed inputs
+echo "Input: $input"
+echo "Model Prefix: $model_prefix"
+echo "Vocabulary Size: $vocab_size"
+
+# train a sentencepiece tokenizer model
+# Llama 2 config can be printed as follows:
+
+# import sentencepiece.sentencepiece_model_pb2
+# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
+# mp.ParseFromString(open("tokenizer.model", "rb").read())
+# print(mp.trainer_spec)
+# print(mp.normalizer_spec)
+
+# this gives:
+
+# trainer_spec {
+#   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
+#   model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
+#   model_type: BPE
+#   vocab_size: 32000
+#   self_test_sample_size: 0
+#   input_format: "text"
+#   character_coverage: 0.9999499917030334
+#   input_sentence_size: 200000000
+#   seed_sentencepiece_size: 1000000
+#   shrinking_factor: 0.75
+#   num_threads: 80
+#   num_sub_iterations: 2
+#   max_sentence_length: 4192
+#   shuffle_input_sentence: true
+#   max_sentencepiece_length: 16
+#   split_by_unicode_script: true
+#   split_by_whitespace: true
+#   split_by_number: true
+#   treat_whitespace_as_suffix: false
+#   split_digits: true
+#   allow_whitespace_only_pieces: true
+#   vocabulary_output_piece_score: true
+#   hard_vocab_limit: true
+#   use_all_vocab: false
+#   byte_fallback: true
+#   required_chars: ""
+#   unk_id: 0
+#   bos_id: 1
+#   eos_id: 2
+#   pad_id: -1
+#   unk_surface: " \342\201\207 "
+#   unk_piece: "<unk>"
+#   bos_piece: "<s>"
+#   eos_piece: "</s>"
+#   pad_piece: "<pad>"
+#   train_extremely_large_corpus: false
+#   enable_differential_privacy: false
+#   differential_privacy_noise_level: 0.0
+#   differential_privacy_clipping_threshold: 0
+# }
+# normalizer_spec {
+#   name: "identity"
+#   precompiled_charsmap: ""
+#   add_dummy_prefix: true
+#   remove_extra_whitespaces: false
+#   normalization_rule_tsv: ""
+# }
+
+# let's now use spm_train to train this exact model
+# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
+
+# we'll depart on a few settings:
+# character_coverage -> 1.0
+
+# other important notes:
+# --split-digits = true, per the paper
+# --allow_whitespace_only_pieces is true, default in spm is false
+# --byte_fallback is true, default in spm is false
+# --normalization_rule_name is identity, default in spm is nmt_nfkc
+
+spm_train --input="$input" \
+          --model_prefix="$model_prefix" \
+          --model_type=bpe \
+          --vocab_size="$vocab_size" \
+          --self_test_sample_size=0 \
+          --input_format="text" \
+          --character_coverage=1.0 \
+          --num_threads="$(nproc)" \
+          --split_digits=true \
+          --allow_whitespace_only_pieces=true \
+          --byte_fallback=true \
+          --unk_surface=" \342\201\207 " \
+          --normalization_rule_name=identity \

From f96c7afb2d6a8cac90c8d64ef97f51ed3cb3d2f7 Mon Sep 17 00:00:00 2001
From: icpp <icpp@icpp.world>
Date: Fri, 11 Aug 2023 10:11:32 -0400
Subject: [PATCH 03/30] Notable fork section for WebAssembly

Added my repo `icpp-llm` for running it on the Internet Computer
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index be6d8d9..fd0726f 100644
--- a/README.md
+++ b/README.md
@@ -245,6 +245,8 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
   - [llama2.py](https://github.com/tairov/llama2.py) by @[tairov](https://github.com/tairov): a simple one file pure Python port of this project with zero dependencies
 - C#
   - [llama2.cs](https://github.com/trrahul/llama2.cs) by @[trrahul](https://github.com/trrahul): a C# port of this project
+- WebAssembly
+  - [icpp-llm](https://github.com/icppWorld/icpp-llm): LLMs for the Internet Computer
 - [llama2.c - Llama 2 Everywhere](https://github.com/trholding/llama2.c) by @[trholding](https://github.com/trholding): Standalone, Bootable & Portable Binary Llama 2
 
 ## unsorted todos

From b0cfa2458d65747424fb4712f072680e2b3bc330 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Fri, 11 Aug 2023 16:47:29 +0000
Subject: [PATCH 04/30] ok i can train and sample a model with a custom
 tokenizer

---
 model.py       |  5 +++--
 sample.py      |  6 +++++-
 tinystories.py | 37 +++++++++++++++++++++++++++++--------
 train.py       | 14 +++++++++++---
 4 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/model.py b/model.py
index f7edbb6..7329d6c 100644
--- a/model.py
+++ b/model.py
@@ -11,12 +11,13 @@ from torch import nn
 
 @dataclass
 class ModelArgs:
+    # default hyperparameters for the Llama 7B model
     dim: int = 4096
     n_layers: int = 32
     n_heads: int = 32
     n_kv_heads: Optional[int] = None
-    vocab_size: int = -1  # defined later by tokenizer
-    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
+    vocab_size: int = 32000
+    multiple_of: int = 256  # MLP hidden layer size will be multiple of
     norm_eps: float = 1e-5
     max_seq_len: int = 2048
     dropout: float = 0.0
diff --git a/sample.py b/sample.py
index 040bc14..93c9407 100644
--- a/sample.py
+++ b/sample.py
@@ -9,6 +9,8 @@ import tiktoken
 from model import ModelArgs, Transformer
 from tokenizer import Tokenizer
 
+from tinystories import get_tokenizer_model_path
+
 # -----------------------------------------------------------------------------
 out_dir = 'out' # ignored if init_from is not 'resume'
 start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
@@ -51,7 +53,9 @@ if compile:
     model = torch.compile(model) # requires PyTorch 2.0 (optional)
 
 # load the tokenizer
-enc = Tokenizer()
+assert checkpoint["config"]["dataset"] == "tinystories" # TODO: generalize
+tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
+enc = Tokenizer(tokenizer_model=tokenizer_model)
 
 # encode the beginning of the prompt
 if start.startswith('FILE:'):
diff --git a/tinystories.py b/tinystories.py
index d41f8fc..278c817 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -120,9 +120,7 @@ def train_vocab(vocab_size):
 
 def process_shard(args, vocab_size):
     shard_id, shard = args
-    tokenizer_model = None
-    if vocab_size > 0:
-        tokenizer_model = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
+    tokenizer_model = get_tokenizer_model_path()
     enc = Tokenizer(tokenizer_model)
     with open(shard, "r") as f:
         data = json.load(f)
@@ -171,10 +169,12 @@ def pretokenize(vocab_size):
 class PretokDataset(torch.utils.data.IterableDataset):
     """Loads pretokenized examples from disk and yields them as PyTorch tensors."""
 
-    def __init__(self, split, max_seq_len):
+    def __init__(self, split, max_seq_len, vocab_size, vocab_source):
         super().__init__()
         self.split = split
         self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.vocab_source = vocab_source
 
     def __iter__(self):
         # get worker info within a DataLoader
@@ -186,8 +186,14 @@ class PretokDataset(torch.utils.data.IterableDataset):
         seed = 42 + worker_id + 1337 * rank
         rng = random.Random(seed)
         print(f"Created a PretokDataset with rng seed {seed}")
-        data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-        shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.bin")))
+        if self.vocab_source == "llama2":
+            # the .bin files are right along the .json files
+            bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
+            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
+        elif self.vocab_source == "custom":
+            # the .bin files are in tok{N} directory
+            bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}")
+            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
         # train/test split. let's use only shard 0 for test split, rest train
         shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
         while True:
@@ -209,12 +215,25 @@ class PretokDataset(torch.utils.data.IterableDataset):
                     y = chunk[1:]
                     yield x, y
 
+# -----------------------------------------------------------------------------
+# public interface functions
+
+def get_tokenizer_model_path(vocab_size):
+    """
+    Returns path to the sentencepiece tokenizer model for a given vocab size
+    vocab_size = 0 designates the default Llama 2 tokenizer, in that case
+    None is returned.
+    """
+    if vocab_size == 0:
+        return None
+    else:
+        return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
 
 class Task:
 
     @staticmethod
-    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
-        ds = PretokDataset(split, max_seq_len)
+    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
+        ds = PretokDataset(**dataset_kwargs)
         dl = torch.utils.data.DataLoader(
             ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
         )
@@ -223,6 +242,8 @@ class Task:
             y = y.to(device, non_blocking=True)
             yield x, y
 
+# -----------------------------------------------------------------------------
+# CLI for constructing the dataset
 
 if __name__ == "__main__":
     """
diff --git a/train.py b/train.py
index dbf0b24..662afcf 100644
--- a/train.py
+++ b/train.py
@@ -47,6 +47,8 @@ wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
 # data
 batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
+vocab_source = "custom" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
+vocab_size = 512
 dataset = "tinystories"  # tinystories|tinyshakespeare
 # model
 dim = 288
@@ -83,6 +85,10 @@ config = {k: globals()[k] for k in config_keys}  # will be useful for logging
 lr_decay_iters = max_iters  # should be ~= max_iters per Chinchilla
 min_lr = 0.0  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
 
+# validating checks
+assert vocab_source in ["llama2", "custom"]
+assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
+
 # various inits, derived attributes, I/O setup
 ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
 if ddp:
@@ -128,6 +134,8 @@ iter_batches = partial(
     task.iter_batches,
     batch_size=batch_size,
     max_seq_len=max_seq_len,
+    vocab_size=vocab_size,
+    vocab_source=vocab_source,
     device=device,
     num_workers=0,
 )
@@ -142,7 +150,7 @@ model_args = dict(
     n_layers=n_layers,
     n_heads=n_heads,
     n_kv_heads=n_heads,
-    vocab_size=32000,
+    vocab_size=vocab_size,
     multiple_of=multiple_of,
     max_seq_len=max_seq_len,
     dropout=dropout,
@@ -206,7 +214,7 @@ def estimate_loss():
     out = {}
     model.eval()
     for split in ["train", "val"]:
-        batch_iter = iter_batches(split)
+        batch_iter = iter_batches(split=split)
         losses = torch.zeros(eval_iters)  # keep on CPU
         for k in range(eval_iters):
             X, Y = next(batch_iter)
@@ -238,7 +246,7 @@ if wandb_log and master_process:
     wandb.init(project=wandb_project, name=wandb_run_name, config=config)
 
 # training loop
-train_batch_iter = iter_batches("train")
+train_batch_iter = iter_batches(split="train")
 X, Y = next(train_batch_iter)  # fetch the very first batch
 t0 = time.time()
 local_iter_num = 0  # number of iterations in the lifetime of this process

From d421a95b2bfe593b2d9e5c147f3efc8d128afe0e Mon Sep 17 00:00:00 2001
From: Johannes Rudolph <johannes.rudolph@gmail.com>
Date: Sat, 12 Aug 2023 20:31:19 +0200
Subject: [PATCH 05/30] optimize sample_topp by filtering out small value
 elements up front

This works because we know that in worst case only 1 element will be selected
and therefore the remaining (n-1) elements have to split the remaining (1-topp)
probability. Probabilities smaller than that cannot be selected and can
be filtered out up front.
---
 run.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/run.c b/run.c
index afe695f..9fd8f76 100644
--- a/run.c
+++ b/run.c
@@ -465,17 +465,24 @@ int sample_topp(float* probabilities, int n, float topp, ProbIndex* probindex) {
     // tokens that exceed probability topp. This way we never sample tokens that
     // have very low probabilities and are less likely to go "off the rails".
 
+    int n0 = 0;
     // quicksort indices in descending order of probabilities
+    // elements smaller than (1 - topp) / (n - 1) cannot be part of the result
+    // and can be filtered out directly
+    const float cutoff = (1.0f - topp) / (n - 1);
     for (int i = 0; i < n; i++) {
-        probindex[i].index = i;
-        probindex[i].prob = probabilities[i];
+        if (probabilities[i] >= cutoff) {
+            probindex[n0].index = i;
+            probindex[n0].prob = probabilities[i];
+            n0++;
+        }
     }
-    qsort(probindex, n, sizeof(ProbIndex), compare);
+    qsort(probindex, n0, sizeof(ProbIndex), compare);
 
     // truncate the list where cumulative probability exceeds topp
     float cumulative_prob = 0.0f;
-    int last_idx = 0;
-    for (int i = 0; i < n; i++) {
+    int last_idx = n0 - 1; // in case of rounding errors consider all elements
+    for (int i = 0; i < n0; i++) {
         cumulative_prob += probindex[i].prob;
         if (cumulative_prob > topp) {
             last_idx = i;

From ea4cedc5884ddbf18da82dc088f33a3ae980f1c6 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 02:00:19 +0000
Subject: [PATCH 06/30] add ability to export custom tokenizer to .bin format
 for run.c file

---
 tokenizer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index 981b2ac..bc2a35a 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -4,7 +4,7 @@
 
 import os
 import struct
-from logging import getLogger
+import argparse
 from typing import List
 
 from sentencepiece import SentencePieceProcessor
@@ -72,5 +72,9 @@ class Tokenizer:
                 f.write(bytes)
 
 if __name__ == "__main__":
-    t = Tokenizer()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ")
+    args = parser.parse_args()
+
+    t = Tokenizer(args.tokenizer_model)
     t.export()

From f5fc0c245fe10826d4b038d9b9ddd3a6bfc01b92 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 02:12:13 +0000
Subject: [PATCH 07/30] final piece: run.c support for new tokenizer, super ez

---
 run.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/run.c b/run.c
index afe695f..14469ad 100644
--- a/run.c
+++ b/run.c
@@ -508,6 +508,7 @@ void error_usage() {
     fprintf(stderr, "  -s <int>    random seed, default time(NULL)\n");
     fprintf(stderr, "  -n <int>    number of steps to run for, default 256. 0 = max_seq_len\n");
     fprintf(stderr, "  -i <string> input prompt\n");
+    fprintf(stderr, "  -z <string> optional path to custom tokenizer\n");
     exit(EXIT_FAILURE);
 }
 
@@ -515,6 +516,7 @@ int main(int argc, char *argv[]) {
 
     // default inits
     char *checkpoint = NULL;  // e.g. out/model.bin
+    char *tokenizer = "tokenizer.bin";
     float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher
     float topp = 1.0f;        // top-p in nucleus sampling. 1.0 = off. 0.9 works well, but slower
     rng_seed = 0; // seed rng with time by default
@@ -534,6 +536,7 @@ int main(int argc, char *argv[]) {
         else if (argv[i][1] == 's') { rng_seed = atoi(argv[i + 1]); }
         else if (argv[i][1] == 'n') { steps = atoi(argv[i + 1]); }
         else if (argv[i][1] == 'i') { prompt = argv[i + 1]; }
+        else if (argv[i][1] == 'z') { tokenizer = argv[i + 1]; }
         else { error_usage(); }
     }
     if(rng_seed == 0) { rng_seed =  (unsigned int)time(NULL);}
@@ -567,13 +570,13 @@ int main(int argc, char *argv[]) {
     // right now we cannot run for more than config.seq_len steps
     if (steps <= 0 || steps > config.seq_len) { steps = config.seq_len; }
 
-    // read in the tokenizer.bin file
+    // read in the tokenizer .bin file
     char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
     float* vocab_scores = (float*)malloc(config.vocab_size * sizeof(float));
     unsigned int max_token_length;
     {
-        FILE *file = fopen("tokenizer.bin", "rb");
-        if (!file) { fprintf(stderr, "couldn't load tokenizer.bin\n"); return 1; }
+        FILE *file = fopen(tokenizer, "rb");
+        if (!file) { fprintf(stderr, "couldn't load %s\n", tokenizer); return 1; }
         if (fread(&max_token_length, sizeof(int), 1, file) != 1) { fprintf(stderr, "failed read\n"); return 1; }
         int len;
         for (int i = 0; i < config.vocab_size; i++) {

From 00a61dc7f92a94069c0b03bc83c8bf30db1b4aa2 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 02:18:30 +0000
Subject: [PATCH 08/30] remove the tinyshakespeare dataset until i can bring it
 back later in a nicer form, otherwise right now we just have a ton of copy
 paste code here

---
 tinyshakespeare.py | 140 ---------------------------------------------
 train.py           |   5 +-
 2 files changed, 1 insertion(+), 144 deletions(-)
 delete mode 100644 tinyshakespeare.py

diff --git a/tinyshakespeare.py b/tinyshakespeare.py
deleted file mode 100644
index 602624c..0000000
--- a/tinyshakespeare.py
+++ /dev/null
@@ -1,140 +0,0 @@
-"""
-Download, preprocess and serve the TinyShakespeare dataset as a DataLoader.
-
-Follows the same interface as the TinyStories dataset.
-"""
-
-import argparse
-import os
-import random
-
-import numpy as np
-import requests
-import torch
-import torch.distributed as dist
-from tqdm import tqdm
-
-from tokenizer import Tokenizer
-
-DATA_CACHE_DIR = "data"
-
-def download_file(url: str, fname: str, chunk_size=1024):
-    """Helper function to download a file from a given url"""
-    resp = requests.get(url, stream=True)
-    total = int(resp.headers.get("content-length", 0))
-    with open(fname, "wb") as file, tqdm(
-        desc=fname,
-        total=total,
-        unit="iB",
-        unit_scale=True,
-        unit_divisor=1024,
-    ) as bar:
-        for data in resp.iter_content(chunk_size=chunk_size):
-            size = file.write(data)
-            bar.update(size)
-
-
-def download():
-    """Downloads the dataset to disk."""
-    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
-
-    # download the TinyShakespeare dataset, unless it's already downloaded
-    data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
-    data_filename = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
-    if not os.path.exists(data_filename):
-        print(f"Downloading {data_url} to {data_filename}...")
-        download_file(data_url, data_filename)
-    else:
-        print(f"{data_filename} already exists, skipping download...")
-
-    print("Download done.")
-
-def pretokenize():
-    enc = Tokenizer()
-
-    data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
-
-    all_tokens = []
-    with open(data_file, "r") as f:
-        for line in f:
-            text = line.strip()
-            tokens = enc.encode(text, bos=True, eos=False)
-            all_tokens.extend(tokens)
-    all_tokens = np.array(all_tokens, dtype=np.uint16)
-    print(f"Total tokens: {len(all_tokens)}")
-    with open(data_file.replace(".txt", ".bin"), "wb") as f:
-        f.write(all_tokens.tobytes())
-    print(f"Saved {data_file.replace('.txt', '.bin')}")
-    print("Done.")
-
-
-class PretokDataset(torch.utils.data.IterableDataset):
-    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""
-
-    def __init__(self, split, max_seq_len):
-        super().__init__()
-        self.split = split
-        self.max_seq_len = max_seq_len
-
-    def __iter__(self):
-        # get worker info within a DataLoader
-        worker_info = torch.utils.data.get_worker_info()
-        worker_id = worker_info.id if worker_info else 0
-        # get DDP rank info
-        rank = dist.get_rank() if dist.is_initialized() else 0
-        # combine the worker_id and worker_rank to create a unique seed for rng
-        seed = 42 + worker_id + 1337 * rank
-        rng = random.Random(seed)
-        print(f"Created a PretokDataset with rng seed {seed}")
-        data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.bin")
-        m_all = np.memmap(data_file, dtype=np.uint16, mode="r")
-
-        # split out 10% of the data for validation
-        split_ix = int(len(m_all) * 0.9)
-        if self.split == "train":
-            m = m_all[:split_ix]
-        else:
-            m = m_all[split_ix:]
-
-        num_batches = len(m) // self.max_seq_len
-        num_batches -= 1  # drop the last partial batch
-        assert num_batches > 0, "this split is way too small? investigate."
-
-        while True:
-            ixs = list(range(num_batches))
-            rng.shuffle(ixs)
-            for ix in ixs:
-                start = ix * self.max_seq_len
-                end = start + self.max_seq_len + 1
-                # calling .astype will copy the data into a new numpy array, now in RAM
-                chunk = torch.from_numpy((m[start:end]).astype(np.int64))
-                x = chunk[:-1]
-                y = chunk[1:]
-                yield x, y
-
-
-class ShakespeareTask:
-
-    @staticmethod
-    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
-        ds = PretokDataset(split, max_seq_len)
-        dl = torch.utils.data.DataLoader(
-            ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
-        )
-        for x, y in dl:
-            x = x.to(device, non_blocking=True)
-            y = y.to(device, non_blocking=True)
-            yield x, y
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"])
-    args = parser.parse_args()
-
-    # depending on the stage call the appropriate function
-    fun = {
-        "download": download,
-        "pretokenize": pretokenize,
-    }
-    fun[args.stage]()
\ No newline at end of file
diff --git a/train.py b/train.py
index 662afcf..39b4f49 100644
--- a/train.py
+++ b/train.py
@@ -29,7 +29,6 @@ from torch.distributed import destroy_process_group, init_process_group
 from torch.nn.parallel import DistributedDataParallel as DDP
 
 from tinystories import Task
-from tinyshakespeare import ShakespeareTask
 
 # -----------------------------------------------------------------------------
 # I/O
@@ -49,7 +48,6 @@ batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch
 max_seq_len = 256
 vocab_source = "custom" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
 vocab_size = 512
-dataset = "tinystories"  # tinystories|tinyshakespeare
 # model
 dim = 288
 n_layers = 6
@@ -129,9 +127,8 @@ ctx = (
 )
 
 # task-specific setup
-task = {'tinystories': Task, 'tinyshakespeare': ShakespeareTask}[dataset]
 iter_batches = partial(
-    task.iter_batches,
+    Task.iter_batches,
     batch_size=batch_size,
     max_seq_len=max_seq_len,
     vocab_size=vocab_size,

From 9c3cfb46a32cc529792f8ae08217035d997c1b3b Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 03:08:07 +0000
Subject: [PATCH 09/30] make default be the llama2 tokenizer

---
 train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 39b4f49..24d6fa6 100644
--- a/train.py
+++ b/train.py
@@ -46,8 +46,8 @@ wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
 # data
 batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
-vocab_source = "custom" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 512
+vocab_source = "llama2" # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
 # model
 dim = 288
 n_layers = 6

From fe49eb222c88787853f47fd3ae5223bb6a5419f3 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 03:16:18 +0000
Subject: [PATCH 10/30] readme for custom tokenizers

---
 README.md | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/README.md b/README.md
index be6d8d9..95fd98a 100644
--- a/README.md
+++ b/README.md
@@ -142,6 +142,47 @@ Which gives the same results. More detailed testing will be done in `test_all.py
 $ pytest
 ```
 
+## custom tokenizers
+
+In everything above, we've assumed the default Llama 2 tokenizer with 32,000 tokens. However, in many boutique LLMs, using a vocabulary this big might be overkill. If you have a small application in mind, you might be much better off training your own tokenizer. This can make everything nicer - with smaller vocabs your model has fewer parameters (because the token embedding table is a lot smaller), the inference is faster (because there are fewer tokens to predict), and your average sequence length per example could also get smaller (because the compression is a lot more efficient on your data). So let's see how we train a custom tokenizer.
+
+By default, to pretokenize the tinystories dataset we had to run, in order:
+
+```
+python tinystories.py download
+python tinystories.py pretokenize
+```
+
+The `pretokenize` stage here loads the Llama 2 tokenizer (vocab size 32,000) and uses it to convert the downloaded text into integers, and saves that to file. We now change this as follows, to train an example 4096-token tokenizer:
+
+```
+python tinystories.py download
+python tinystories.py train_vocab --vocab_size=4096
+python tinystories.py pretokenize --vocab_size=4096
+```
+
+The `train_vocab` stage will call the `train_vocab.sh` script, which calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
+
+Now that we have pretokenized the dataset with our custom tokenizer, we can train the model. The training script `train.py` doesn't care about the exact tokens, it only cares about the vocabulary size so it can correctly initialize the model. So when training your model, make sure to pass in
+
+```
+python train.py --vocab_source=custom --vocab_size=4096
+```
+
+(The defaults are `llama2` and `32000` respectively, which indicates the default Llama 2 tokenizer). This trains the model. Finally we are ready to run inference with our `run.c` script. For that we need two things. Number one, we have to export our tokenizer in the `.bin` format, do that with:
+
+```
+python tokenizer.py --tokenizer-model=data/tok4096.model
+```
+
+This writes the tokenizer to `data/tok4096.bin`. Now we can run inference, pointing it to this tokenizer using the `-z` flag:
+
+```
+./run out/model.bin -z data/tok4096.bin
+```
+
+This should print the samples. If you leave out the `-z` flag, it will use the default Llama 2 tokenizer: the model would still generate a good sequence of integers, but they would get translated to text using a different vocabulary, so the output would look like gibberish.
+
 ## performance
 
 There are many ways to potentially speed up this code depending on your system. Have a look at the [Makefile](Makefile), which contains a lot of notes. The `make run` command currently uses the `-O3` optimization by default, i.e.:

From 1d14cb8dd8884eefa3f15d06263ec4ab95a4b703 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 03:19:35 +0000
Subject: [PATCH 11/30] add note about 4096 vs 32000 token size on tinystories

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 95fd98a..331bb7a 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,8 @@ python tinystories.py pretokenize --vocab_size=4096
 
 The `train_vocab` stage will call the `train_vocab.sh` script, which calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
 
+A quick note of interest is that a custom tokenizer with a vocab size of 4096, trained specifically on tinystories, creates integer sequences with about the same sequence length per example as the default Llama 2 tokenizer of 32000 tokens! This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster.
+
 Now that we have pretokenized the dataset with our custom tokenizer, we can train the model. The training script `train.py` doesn't care about the exact tokens, it only cares about the vocabulary size so it can correctly initialize the model. So when training your model, make sure to pass in
 
 ```

From 9ff459b9258c20a5fcf6539e988f003e6e31f255 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 03:24:31 +0000
Subject: [PATCH 12/30] todo changes

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 331bb7a..2c3614e 100644
--- a/README.md
+++ b/README.md
@@ -292,12 +292,12 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
 
 ## unsorted todos
 
+- revive tests; train a tiny Llama test model (committed to repo) and use it as reference in unit tests
+- make it easier to add a new dataset with not too much pain
 - add multiquery support into run.c
-- add custom bpe training code and the ability to train a smaller vocabulary (32K is to much)
 - should calculate freq_cis online in the script run.c instead of loading them
 - int4/8 quantization
 - export the model in a more sensible output format with a proper header, etc.
-- train a tiny Llama test model (committed to repo) and use it as reference in unit tests
 - support Llama 2 7B Chat models and tune run.c to Chat UI/UX
 - llama2.cu investigate and merge
 - (LoRA) finetuning and export of Llama 2 models

From 27adb082f1b71147616e104081bf6a86a93e06b1 Mon Sep 17 00:00:00 2001
From: Tian Lin <lintian06@users.noreply.github.com>
Date: Sun, 13 Aug 2023 21:58:14 +0800
Subject: [PATCH 13/30] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 2c3614e..57f981c 100644
--- a/README.md
+++ b/README.md
@@ -259,6 +259,7 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
   - [llama2.rs](https://github.com/gaxler/llama2.rs) by @[gaxler](https://github.com/gaxler): a Rust port of this project
   - [llama2.rs](https://github.com/leo-du/llama2.rs) by @[leo-du](https://github.com/leo-du): A Rust port of this project
   - [llama2-rs](https://github.com/danielgrittner/llama2-rs) by @[danielgrittner](https://github.com/danielgrittner): a Rust port of this project
+  - [llama2.rs](https://github.com/lintian06/llama2.rs) by @[lintian06](https://github.com/lintian06): A Rust port of this project
 - Go
   - [go-llama2](https://github.com/tmc/go-llama2) by @[tmc](https://github.com/tmc): a Go port of this project
   - [llama2.go](https://github.com/nikolaydubina/llama2.go) by @[nikolaydubina](https://github.com/nikolaydubina): a Go port of this project

From 570789aa04e2c487c18778d71f16c33f1bf45d04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mihai=20Nad=C4=83=C8=99?= <mihai@nadas.ro>
Date: Sun, 13 Aug 2023 17:49:10 +0300
Subject: [PATCH 14/30] Fixes https://github.com/karpathy/llama2.c/issues/280

There was a small bug in tinystories.py, described here: https://github.com/karpathy/llama2.c/issues/280

This commit simply passes vocab_size to get_tokenizer_model_path to avoid silent crash when processing shards (in process_shard)
---
 tinystories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinystories.py b/tinystories.py
index 278c817..690cb02 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -120,7 +120,7 @@ def train_vocab(vocab_size):
 
 def process_shard(args, vocab_size):
     shard_id, shard = args
-    tokenizer_model = get_tokenizer_model_path()
+    tokenizer_model = get_tokenizer_model_path(vocab_size)
     enc = Tokenizer(tokenizer_model)
     with open(shard, "r") as f:
         data = json.load(f)

From 1d68a36d14b13200a191e5fe88fbd97db4d88a39 Mon Sep 17 00:00:00 2001
From: Oleksandr Nikitin <oleksandr@tvori.info>
Date: Sun, 13 Aug 2023 19:10:07 +0300
Subject: [PATCH 15/30] Add TypeScript port

I've never been so happy to have missed that the JS port already exists :D also it was nice to discover that the JS can reach 80% of the single-threaded C speed (10 tokens/s for TinyStories-110M)
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 2c3614e..62fdb76 100644
--- a/README.md
+++ b/README.md
@@ -271,6 +271,7 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
   - [llama2.cpp](https://github.com/leloykun/llama2.cpp) by @[leloykun](https://github.com/leloykun): a C++ port of this project
 - JavaScript
   - [llama2.js](https://github.com/epicure/llama2.js) by @[epicure](https://github.com/epicure): a JavaScript port of this project
+  - [llama2.ts](https://github.com/wizzard0/llama2.ts) by @[oleksandr_now](https://twitter.com/oleksandr_now): a TypeScript port of this project
   - [llama2.c-emscripten](https://github.com/gohai/llama2.c-emscripten) by @[gohai](https://github.com/gohai): Emscripten (JavaScript) port, based on @ggerganov's initial prototype
 - Zig
   - [llama2.zig](https://github.com/cgbur/llama2.zig) by @[cgbur](https://github.com/cgbur): A Zig port of this project

From 0e6213c6e0f636d9609761b19e6bc97e4109fd95 Mon Sep 17 00:00:00 2001
From: Oleksandr Nikitin <oleksandr@tvori.info>
Date: Sun, 13 Aug 2023 20:02:34 +0300
Subject: [PATCH 16/30] Mention I can run the full 7B model

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 62fdb76..fee7fc5 100644
--- a/README.md
+++ b/README.md
@@ -271,7 +271,7 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
   - [llama2.cpp](https://github.com/leloykun/llama2.cpp) by @[leloykun](https://github.com/leloykun): a C++ port of this project
 - JavaScript
   - [llama2.js](https://github.com/epicure/llama2.js) by @[epicure](https://github.com/epicure): a JavaScript port of this project
-  - [llama2.ts](https://github.com/wizzard0/llama2.ts) by @[oleksandr_now](https://twitter.com/oleksandr_now): a TypeScript port of this project
+  - [llama2.ts](https://github.com/wizzard0/llama2.ts) by @[oleksandr_now](https://twitter.com/oleksandr_now): a TypeScript port of this project. Full Llama2-7B capable.
   - [llama2.c-emscripten](https://github.com/gohai/llama2.c-emscripten) by @[gohai](https://github.com/gohai): Emscripten (JavaScript) port, based on @ggerganov's initial prototype
 - Zig
   - [llama2.zig](https://github.com/cgbur/llama2.zig) by @[cgbur](https://github.com/cgbur): A Zig port of this project

From 38bfac90a887a1f8d7b61849f4ec58e26b267efe Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 19:34:05 +0000
Subject: [PATCH 17/30] bigchange: add multiquery support in run.c. we can now
 train and inference multiquery models (where n_kv_heads < n_heads). this also
 means that we, in principle, support Llama 2 34B and 70B models, which are
 multiquery

---
 README.md |  1 -
 model.py  |  1 +
 run.c     | 53 ++++++++++++++++++++++++++++++-----------------------
 sample.py |  1 -
 train.py  |  3 ++-
 5 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 2c3614e..664fb0f 100644
--- a/README.md
+++ b/README.md
@@ -294,7 +294,6 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
 
 - revive tests; train a tiny Llama test model (committed to repo) and use it as reference in unit tests
 - make it easier to add a new dataset with not too much pain
-- add multiquery support into run.c
 - should calculate freq_cis online in the script run.c instead of loading them
 - int4/8 quantization
 - export the model in a more sensible output format with a proper header, etc.
diff --git a/model.py b/model.py
index 7329d6c..c8c82a9 100644
--- a/model.py
+++ b/model.py
@@ -94,6 +94,7 @@ class Attention(nn.Module):
     def __init__(self, args: ModelArgs):
         super().__init__()
         self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        assert args.n_heads % self.n_kv_heads == 0
         model_parallel_size = 1
         self.n_local_heads = args.n_heads // model_parallel_size
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
diff --git a/run.c b/run.c
index 14469ad..4a6e8c2 100644
--- a/run.c
+++ b/run.c
@@ -39,11 +39,11 @@ typedef struct {
     // weights for rmsnorms
     float* rms_att_weight; // (layer, dim) rmsnorm weights
     float* rms_ffn_weight; // (layer, dim)
-    // weights for matmuls
-    float* wq; // (layer, dim, dim)
-    float* wk; // (layer, dim, dim)
-    float* wv; // (layer, dim, dim)
-    float* wo; // (layer, dim, dim)
+    // weights for matmuls. note dim == n_heads * head_size
+    float* wq; // (layer, dim, n_heads * head_size)
+    float* wk; // (layer, dim, n_kv_heads * head_size)
+    float* wv; // (layer, dim, n_kv_heads * head_size)
+    float* wo; // (layer, n_heads * head_size, dim)
     // weights for ffn
     float* w1; // (layer, hidden_dim, dim)
     float* w2; // (layer, dim, hidden_dim)
@@ -82,6 +82,7 @@ typedef struct {
 
 void malloc_run_state(RunState* s, Config* p) {
     // we calloc instead of malloc to keep valgrind happy
+    int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
     s->x = calloc(p->dim, sizeof(float));
     s->xb = calloc(p->dim, sizeof(float));
     s->xb2 = calloc(p->dim, sizeof(float));
@@ -93,8 +94,8 @@ void malloc_run_state(RunState* s, Config* p) {
     s->att = calloc(p->n_heads * p->seq_len, sizeof(float));
     s->logits = calloc(p->vocab_size, sizeof(float));
     s->probindex = calloc(p->vocab_size, sizeof(ProbIndex));
-    s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
-    s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
+    s->key_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
+    s->value_cache = calloc(p->n_layers * p->seq_len * kv_dim, sizeof(float));
     // ensure all mallocs went fine
     if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
      || !s->k || !s->v || !s->att || !s->logits || !s->key_cache
@@ -124,19 +125,20 @@ void free_run_state(RunState* s) {
 // initialization: read from checkpoint
 
 void checkpoint_init_weights(TransformerWeights *w, Config* p, float* f, int shared_weights) {
+    int head_size = p->dim / p->n_heads;
     float* ptr = f;
     w->token_embedding_table = ptr;
     ptr += p->vocab_size * p->dim;
     w->rms_att_weight = ptr;
     ptr += p->n_layers * p->dim;
     w->wq = ptr;
-    ptr += p->n_layers * p->dim * p->dim;
+    ptr += p->n_layers * p->dim * (p->n_heads * head_size);
     w->wk = ptr;
-    ptr += p->n_layers * p->dim * p->dim;
+    ptr += p->n_layers * p->dim * (p->n_kv_heads * head_size);
     w->wv = ptr;
-    ptr += p->n_layers * p->dim * p->dim;
+    ptr += p->n_layers * p->dim * (p->n_kv_heads * head_size);
     w->wo = ptr;
-    ptr += p->n_layers * p->dim * p->dim;
+    ptr += p->n_layers * (p->n_heads * head_size) * p->dim;
     w->rms_ffn_weight = ptr;
     ptr += p->n_layers * p->dim;
     w->w1 = ptr;
@@ -148,7 +150,6 @@ void checkpoint_init_weights(TransformerWeights *w, Config* p, float* f, int sha
     w->rms_final_weight = ptr;
     ptr += p->dim;
     w->freq_cis_real = ptr;
-    int head_size = p->dim / p->n_heads;
     ptr += p->seq_len * head_size / 2;
     w->freq_cis_imag = ptr;
     ptr += p->seq_len * head_size / 2;
@@ -218,6 +219,8 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
     // a few convenience variables
     float *x = s->x;
     int dim = p->dim;
+    int kv_dim = (p->dim * p->n_kv_heads) / p->n_heads;
+    int kv_mul = p->n_heads / p->n_kv_heads; // integer multiplier of the kv sharing in multiquery
     int hidden_dim =  p->hidden_dim;
     int head_size = dim / p->n_heads;
 
@@ -237,29 +240,33 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
 
         // qkv matmuls for this position
         matmul(s->q, s->xb, w->wq + l*dim*dim, dim, dim);
-        matmul(s->k, s->xb, w->wk + l*dim*dim, dim, dim);
-        matmul(s->v, s->xb, w->wv + l*dim*dim, dim, dim);
+        matmul(s->k, s->xb, w->wk + l*dim*kv_dim, dim, kv_dim);
+        matmul(s->v, s->xb, w->wv + l*dim*kv_dim, dim, kv_dim);
 
         // RoPE relative positional encoding: complex-valued rotate q and k by freq_cis in each head
         for (int i = 0; i < dim; i+=2) {
             float q0 = s->q[i];
             float q1 = s->q[i+1];
-            float k0 = s->k[i];
-            float k1 = s->k[i+1];
             float fcr = freq_cis_real_row[(i % head_size) / 2];
             float fci = freq_cis_imag_row[(i % head_size) / 2];
             s->q[i]   = q0 * fcr - q1 * fci;
             s->q[i+1] = q0 * fci + q1 * fcr;
+        }
+        for (int i = 0; i < kv_dim; i+=2) {
+            float k0 = s->k[i];
+            float k1 = s->k[i+1];
+            float fcr = freq_cis_real_row[(i % head_size) / 2];
+            float fci = freq_cis_imag_row[(i % head_size) / 2];
             s->k[i]   = k0 * fcr - k1 * fci;
             s->k[i+1] = k0 * fci + k1 * fcr;
         }
 
         // save key,value at this time step (pos) to our kv cache
-        int loff = l * p->seq_len * dim; // kv cache layer offset for convenience
-        float* key_cache_row = s->key_cache + loff + pos * dim;
-        float* value_cache_row = s->value_cache + loff + pos * dim;
-        memcpy(key_cache_row, s->k, dim*sizeof(*key_cache_row));
-        memcpy(value_cache_row, s->v, dim*sizeof(*value_cache_row));
+        int loff = l * p->seq_len * kv_dim; // kv cache layer offset for convenience
+        float* key_cache_row = s->key_cache + loff + pos * kv_dim;
+        float* value_cache_row = s->value_cache + loff + pos * kv_dim;
+        memcpy(key_cache_row, s->k, kv_dim * sizeof(*key_cache_row));
+        memcpy(value_cache_row, s->v, kv_dim * sizeof(*value_cache_row));
 
         // multihead attention. iterate over all heads
         int h;
@@ -272,7 +279,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
             // iterate over all timesteps, including the current one
             for (int t = 0; t <= pos; t++) {
                 // get the key vector for this head and at this timestep
-                float* k = s->key_cache + loff + t * dim + h * head_size;
+                float* k = s->key_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
                 // calculate the attention score as the dot product of q and k
                 float score = 0.0f;
                 for (int i = 0; i < head_size; i++) {
@@ -291,7 +298,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
             memset(xb, 0, head_size * sizeof(float));
             for (int t = 0; t <= pos; t++) {
                 // get the value vector for this head and at this timestep
-                float* v = s->value_cache + loff + t * dim + h * head_size;
+                float* v = s->value_cache + loff + t * kv_dim + (h / kv_mul) * head_size;
                 // get the attention weight for this timestep
                 float a = att[t];
                 // accumulate the weighted value into xb
diff --git a/sample.py b/sample.py
index 93c9407..2f66e7f 100644
--- a/sample.py
+++ b/sample.py
@@ -53,7 +53,6 @@ if compile:
     model = torch.compile(model) # requires PyTorch 2.0 (optional)
 
 # load the tokenizer
-assert checkpoint["config"]["dataset"] == "tinystories" # TODO: generalize
 tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
 enc = Tokenizer(tokenizer_model=tokenizer_model)
 
diff --git a/train.py b/train.py
index 24d6fa6..b1972dc 100644
--- a/train.py
+++ b/train.py
@@ -52,6 +52,7 @@ vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
 dim = 288
 n_layers = 6
 n_heads = 6
+n_kv_heads = 6
 multiple_of = 32
 dropout = 0.0
 # adamw optimizer
@@ -146,7 +147,7 @@ model_args = dict(
     dim=dim,
     n_layers=n_layers,
     n_heads=n_heads,
-    n_kv_heads=n_heads,
+    n_kv_heads=n_kv_heads,
     vocab_size=vocab_size,
     multiple_of=multiple_of,
     max_seq_len=max_seq_len,

From 58075b5ac5935d1f22c3935fdedbcf60de3e1474 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 20:31:32 +0000
Subject: [PATCH 18/30] update API of sample.py to be better, small changes
 here

---
 README.md |  3 +--
 sample.py | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 9b054dc..19db674 100644
--- a/README.md
+++ b/README.md
@@ -132,8 +132,7 @@ Watch the tokens stream by, fun! We can also run the PyTorch inference script fo
 
 ```bash
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt -P out15M
-mv out15M/stories15M.pt out15M/ckpt.pt # sorry the sample script current assumes this directory structure / filename...
-python sample.py --out_dir=out15M
+python sample.py --checkpoint=out15M/stories15M.pt
 ```
 
 Which gives the same results. More detailed testing will be done in `test_all.py`. Currently you will need two files to test or sample: both the .bin file, and the .ckpt file inside a directory (see `test_all.py` for details). Sorry this is a bit janky right now, I have to think through running the tests without having to download 200MB of data. But run the tests with pytest:
diff --git a/sample.py b/sample.py
index 2f66e7f..64bb177 100644
--- a/sample.py
+++ b/sample.py
@@ -12,12 +12,13 @@ from tokenizer import Tokenizer
 from tinystories import get_tokenizer_model_path
 
 # -----------------------------------------------------------------------------
-out_dir = 'out' # ignored if init_from is not 'resume'
+checkpoint = 'out/ckpt.pt'
 start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
 num_samples = 1 # number of samples to draw
 max_new_tokens = 100 # number of tokens generated in each sample
 temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
 top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability
+tokenizer = "" # override the tokenizer model path
 seed = 1337
 device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
 #dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
@@ -35,11 +36,10 @@ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torc
 ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
 
 # init from a model saved in a specific directory
-ckpt_path = os.path.join(out_dir, 'ckpt.pt')
-checkpoint = torch.load(ckpt_path, map_location=device)
-gptconf = ModelArgs(**checkpoint['model_args'])
+checkpoint_dict = torch.load(checkpoint, map_location=device)
+gptconf = ModelArgs(**checkpoint_dict['model_args'])
 model = Transformer(gptconf)
-state_dict = checkpoint['model']
+state_dict = checkpoint_dict['model']
 unwanted_prefix = '_orig_mod.'
 for k,v in list(state_dict.items()):
     if k.startswith(unwanted_prefix):
@@ -52,8 +52,11 @@ if compile:
     print("Compiling the model...")
     model = torch.compile(model) # requires PyTorch 2.0 (optional)
 
-# load the tokenizer
-tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
+# load the tokenizer, either provided, or attempt to find it
+if tokenizer:
+    tokenizer_model = tokenizer
+else:
+    tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
 enc = Tokenizer(tokenizer_model=tokenizer_model)
 
 # encode the beginning of the prompt

From 3e989e21f2c25b29caa9ea9f7e22bdb4385c4780 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 20:38:05 +0000
Subject: [PATCH 19/30] link to stories260K model

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 19db674..8b49b74 100644
--- a/README.md
+++ b/README.md
@@ -85,11 +85,12 @@ base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should
 
 For the sake of examples of smaller, from-scratch models, I trained a small model series on TinyStories. All of these trained in a few hours on my training setup (4X A100 40GB GPUs). The 110M took around 24 hours. I am hosting them on huggingface hub [tinyllamas](https://huggingface.co/karpathy/tinyllamas), both in the original PyTorch .pt, and also in the llama2.c format .bin:
 
-| model | dim | n_layers | n_heads | max context length | parameters | val loss | download
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| OG | 288 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
-| 42M| 512 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
-| 110M| 768 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |
+| model | dim | n_layers | n_heads | n_kv_heads | max context length | parameters | val loss | download
+| --- | --- | --- | | --- | --- | --- | --- | --- | --- |
+| 260K | 64 | 5 | 8 | 4 | 512 | 260K | 1.2968 | [stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
+| OG | 288 | 6 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
+| 42M| 512 | 8 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
+| 110M| 768 | 12 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |
 
 You'll notice that the 110M model is equivalent to GPT-1 in size. Alternatively, this is also the smallest model in the GPT-2 series (`GPT-2 small`), except the max context length is only 1024 instead of 2048. The only notable changes from GPT-1/2 architecture is that Llama uses RoPE relatively positional embeddings instead of absolute/learned positional embeddings, a bit more fancy SwiGLU non-linearity in the MLP, RMSNorm instead of LayerNorm, bias=False on all Linear layers, and is optionally multiquery (but this is not yet supported in llama2.c).
 

From b2cce341e06bb4699edc8307812643a1da9943c7 Mon Sep 17 00:00:00 2001
From: Andrej <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 13:39:12 -0700
Subject: [PATCH 20/30] oops typo fix in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8b49b74..15efce0 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should
 For the sake of examples of smaller, from-scratch models, I trained a small model series on TinyStories. All of these trained in a few hours on my training setup (4X A100 40GB GPUs). The 110M took around 24 hours. I am hosting them on huggingface hub [tinyllamas](https://huggingface.co/karpathy/tinyllamas), both in the original PyTorch .pt, and also in the llama2.c format .bin:
 
 | model | dim | n_layers | n_heads | n_kv_heads | max context length | parameters | val loss | download
-| --- | --- | --- | | --- | --- | --- | --- | --- | --- |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | 260K | 64 | 5 | 8 | 4 | 512 | 260K | 1.2968 | [stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
 | OG | 288 | 6 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
 | 42M| 512 | 8 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |

From 0805cb2c317146a38e893bad8286b0f14860fe97 Mon Sep 17 00:00:00 2001
From: Andrej <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 13:40:09 -0700
Subject: [PATCH 21/30] tiny whitespace fix to try to eliminate scrollbar

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 15efce0..5c04483 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ For the sake of examples of smaller, from-scratch models, I trained a small mode
 
 | model | dim | n_layers | n_heads | n_kv_heads | max context length | parameters | val loss | download
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| 260K | 64 | 5 | 8 | 4 | 512 | 260K | 1.2968 | [stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
+| 260K | 64 | 5 | 8 | 4 | 512 | 260K | 1.297 | [stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
 | OG | 288 | 6 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
 | 42M| 512 | 8 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
 | 110M| 768 | 12 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |

From f0024cfc885a1f5bac58200ee4aaf00caefcf0b4 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 21:22:44 +0000
Subject: [PATCH 22/30] revive tests. now that we have a tiny stories260K model
 this only requires a 2MB download. phew

---
 README.md   | 17 +++++++----
 test_all.py | 86 ++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index 5c04483..d2a478a 100644
--- a/README.md
+++ b/README.md
@@ -136,11 +136,7 @@ wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt -P ou
 python sample.py --checkpoint=out15M/stories15M.pt
 ```
 
-Which gives the same results. More detailed testing will be done in `test_all.py`. Currently you will need two files to test or sample: both the .bin file, and the .ckpt file inside a directory (see `test_all.py` for details). Sorry this is a bit janky right now, I have to think through running the tests without having to download 200MB of data. But run the tests with pytest:
-
-```bash
-$ pytest
-```
+Which gives the same results.
 
 ## custom tokenizers
 
@@ -227,6 +223,17 @@ On **Windows**, use `build_msvc.bat` in a Visual Studio Command Prompt to build
 
 On **Centos 7**, **Amazon Linux 2018** use `rungnu` Makefile target: `make rungnu` or `make runompgnu` to use openmp.
 
+## tests
+
+You can run tests simply with pytest:
+
+```bash
+$ pip install pytest
+$ pytest
+```
+
+This will currently invoke two tests inside `test_all.py`, which forward the model in both C and Python for 200 steps and check the output against a known good expected output. The tests currently run in only a few seconds, but will have to download and cache the stories260K models in a temporary `test` directory (only ~2MB download).
+
 ## ack
 
 I trained the llama2.c storyteller models on a 4X A100 40GB box graciously provided by the excellent [Lambda labs](https://lambdalabs.com/service/gpu-cloud), thank you.
diff --git a/test_all.py b/test_all.py
index 8563614..e8590ea 100644
--- a/test_all.py
+++ b/test_all.py
@@ -4,37 +4,65 @@ $ pytest
 """
 import os
 import pytest # pip install pytest
+import requests
 import subprocess
 
+
 import torch
 from model import ModelArgs, Transformer
+from tokenizer import Tokenizer
 
-def test_argmax_inference():
-    """
-    Only the simplest test for now: run inference with temperature 0 
-    (for determinism) in both C and PyTorch, and see that the sampled tokens 
-    are the same.
-    """
-    test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?
+# -----------------------------------------------------------------------------
+# test utilities
 
-    # run C version
-    model_path = os.path.join(test_ckpt_dir, "model.bin")
-    command = ["./run", model_path, "0.0"]
-    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-    c_tokens = []
-    for line in proc.stdout:
-        token = int(line.decode('utf-8').strip())
-        c_tokens.append(token)
-    proc.wait()
-    #print(c_tokens)
+test_ckpt_dir = "test"
 
-    # run PyTorch version
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
-    checkpoint = torch.load(ckpt_path, map_location=device)
-    gptconf = ModelArgs(**checkpoint['model_args'])
+def download_file(url, filename):
+    print(f"Downloading {url} to {filename}")
+    response = requests.get(url, stream=True)
+    response.raise_for_status() # Raise an HTTPError on bad status code
+    with open(filename, 'wb') as file:
+        for chunk in response.iter_content(chunk_size=8192):
+            file.write(chunk)
+
+def attempt_download_files():
+    os.makedirs(test_ckpt_dir, exist_ok=True)
+    root_url = "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K"
+    need = ["stories260K.bin", "stories260K.pt", "tok512.bin", "tok512.model"]
+    for file in need:
+        url = os.path.join(root_url, file)
+        filename = os.path.join(test_ckpt_dir, file)
+        if not os.path.exists(filename):
+            download_file(url, filename)
+
+expected_stdout = b'Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.\nLily\'s mom said, "Lily, let\'s go to the park." Lily was sad and didn\'t know what to do. She said, "I want to play with your ball, but I can\'t find it."\nLily was sad and didn\'t know what to do. She said, "I\'m sorry, Lily. I didn\'t know what to do."\nLily didn\'t want to help her mom, so she'
+
+# -----------------------------------------------------------------------------
+# actual tests
+
+def test_runc():
+    """ Forwards a model against a known-good desired outcome in run.c for 200 steps"""
+
+    model_path = os.path.join(test_ckpt_dir, "stories260K.bin")
+    tokenizer_path = os.path.join(test_ckpt_dir, "tok512.bin")
+    command = ["./run", model_path, "-z", tokenizer_path, "-t", "0.0", "-n", "200"]
+    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    stdout, stderr = proc.communicate()
+
+    # strip the very last \n that is added by run.c for aesthetic reasons
+    stdout = stdout[:-1]
+    assert stdout == expected_stdout
+
+def test_python():
+    """ Forwards a model against a known-good desired outcome in sample.py for 200 steps"""
+
+    device = "cpu" # stories260K is small enough to just breeze through it on CPU
+    checkpoint = os.path.join(test_ckpt_dir, "stories260K.pt")
+    checkpoint_dict = torch.load(checkpoint, map_location=device)
+    gptconf = ModelArgs(**checkpoint_dict['model_args'])
     model = Transformer(gptconf)
-    state_dict = checkpoint['model']
+    state_dict = checkpoint_dict['model']
     unwanted_prefix = '_orig_mod.'
     for k,v in list(state_dict.items()):
         if k.startswith(unwanted_prefix):
@@ -44,10 +72,12 @@ def test_argmax_inference():
     model.to(device)
     x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
     with torch.inference_mode():
-        y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
+        y = model.generate(x, max_new_tokens=200, temperature=0.0)
     pt_tokens = y[0].tolist()
-    pt_tokens = pt_tokens[1:] # remove BOS
-    #print(pt_tokens)
 
-    # compare
-    assert c_tokens == pt_tokens
+    tokenizer_model = os.path.join(test_ckpt_dir, "tok512.model")
+    enc = Tokenizer(tokenizer_model=tokenizer_model)
+    text = enc.decode(pt_tokens)
+    text = text.encode('ascii') # turn into bytes
+
+    assert text == expected_stdout
\ No newline at end of file

From 850603618597cd1ef88de482b2ba49be2190cfd1 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 21:23:27 +0000
Subject: [PATCH 23/30] remove 'revive tests' as a todo from the readme

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index d2a478a..e14e39f 100644
--- a/README.md
+++ b/README.md
@@ -302,7 +302,6 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
 
 ## unsorted todos
 
-- revive tests; train a tiny Llama test model (committed to repo) and use it as reference in unit tests
 - make it easier to add a new dataset with not too much pain
 - should calculate freq_cis online in the script run.c instead of loading them
 - int4/8 quantization

From 86325bf7e83392e488e4442649b65f73d70d2b07 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 23:35:29 +0000
Subject: [PATCH 24/30] attempt to upgrade the CI to run our pytest

---
 .github/workflows/build.yml | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a954469..f8b216b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -4,10 +4,12 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h']
+    paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h', '**/*.py']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/Makefile', '**/*.c', '**/*.h']
+    paths: ['**/Makefile', '**/*.c', '**/*.h', '**/*.py']
+  # for manual triggering
+  workflow_dispatch:
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -15,7 +17,7 @@ env:
 jobs:
   # check basic builds to avoid breaking changes
   ubuntu-focal-make:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     steps:
       - name: Clone
@@ -28,6 +30,16 @@ jobs:
           sudo apt-get update
           sudo apt-get install build-essential -y
 
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.10"
+
+      - name: Pip setup
+        run: |
+          python -m pip install --upgrade pip
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
       - name: Build
         id: make_build
         run: |
@@ -38,6 +50,10 @@ jobs:
         run: |
           make runfast
 
+      - name: Test with pytest
+        run: |
+          pytest
+
   macOS-latest-make:
     runs-on: macos-latest
 

From 223a67048adede28f43993dd862d49d6950c4347 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 23:39:37 +0000
Subject: [PATCH 25/30] add optional manual dispatch of actions

---
 .github/workflows/build.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index a954469..13b5be4 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,6 +8,8 @@ on:
   pull_request:
     types: [opened, synchronize, reopened]
     paths: ['**/Makefile', '**/*.c', '**/*.h']
+  # for manual triggering
+  workflow_dispatch:
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

From c970f69334fa8f07a8d359430097bca86a96e754 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 23:48:01 +0000
Subject: [PATCH 26/30] oops i should probably call this function lol

---
 test_all.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_all.py b/test_all.py
index e8590ea..625af44 100644
--- a/test_all.py
+++ b/test_all.py
@@ -42,6 +42,7 @@ expected_stdout = b'Once upon a time, there was a little girl named Lily. She lo
 
 def test_runc():
     """ Forwards a model against a known-good desired outcome in run.c for 200 steps"""
+    attempt_download_files()
 
     model_path = os.path.join(test_ckpt_dir, "stories260K.bin")
     tokenizer_path = os.path.join(test_ckpt_dir, "tok512.bin")
@@ -56,6 +57,7 @@ def test_runc():
 
 def test_python():
     """ Forwards a model against a known-good desired outcome in sample.py for 200 steps"""
+    attempt_download_files()
 
     device = "cpu" # stories260K is small enough to just breeze through it on CPU
     checkpoint = os.path.join(test_ckpt_dir, "stories260K.pt")

From 854c97b660fc8527a979ab5cf26436a6146f2ade Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Mon, 14 Aug 2023 00:12:45 +0000
Subject: [PATCH 27/30] turn topp 0.9 back on by default thanks to recent PR
 contributions truncating before quicksort

---
 README.md | 2 +-
 run.c     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index fe1b32f..99416d5 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ You can also prompt the model with a prefix or a number of additional command li
 
 There is also an even better 110M param model available, see [models](#models).
 
-Quick note on sampling, the recommendation for ~best results is to sample with `-t 1.0 -p 0.9`, i.e. temperature 1.0 (default) but also top-p sampling at 0.9 (not default!). The top-p sampling is turned off by default because it can run quite a bit slower. More generally, to control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
+Quick note on sampling, the recommendation for ~best results is to sample with `-t 1.0 -p 0.9`, i.e. temperature 1.0 (default) but also top-p sampling at 0.9 (default). Intuitively, top-p ensures that tokens with tiny probabilities do not get sampled, so we can't get "unlucky" during sampling, and we are less likely to go "off the rails" afterwards. More generally, to control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
 
 ## Meta's Llama 2 models
 
diff --git a/run.c b/run.c
index d66e838..426e7e8 100644
--- a/run.c
+++ b/run.c
@@ -474,8 +474,8 @@ int sample_topp(float* probabilities, int n, float topp, ProbIndex* probindex) {
 
     int n0 = 0;
     // quicksort indices in descending order of probabilities
-    // elements smaller than (1 - topp) / (n - 1) cannot be part of the result
-    // and can be filtered out directly
+    // values smaller than (1 - topp) / (n - 1) cannot be part of the result
+    // so for efficiency we crop these out as candidates before sorting
     const float cutoff = (1.0f - topp) / (n - 1);
     for (int i = 0; i < n; i++) {
         if (probabilities[i] >= cutoff) {
@@ -518,7 +518,7 @@ void error_usage() {
     fprintf(stderr, "Example: run model.bin -n 256 -i \"Once upon a time\"\n");
     fprintf(stderr, "Options:\n");
     fprintf(stderr, "  -t <float>  temperature, default 1.0\n");
-    fprintf(stderr, "  -p <float>  p value in top-p (nucleus) sampling. default 1.0 (=off)\n");
+    fprintf(stderr, "  -p <float>  p value in top-p (nucleus) sampling. default 0.9\n");
     fprintf(stderr, "  -s <int>    random seed, default time(NULL)\n");
     fprintf(stderr, "  -n <int>    number of steps to run for, default 256. 0 = max_seq_len\n");
     fprintf(stderr, "  -i <string> input prompt\n");
@@ -532,7 +532,7 @@ int main(int argc, char *argv[]) {
     char *checkpoint = NULL;  // e.g. out/model.bin
     char *tokenizer = "tokenizer.bin";
     float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher
-    float topp = 1.0f;        // top-p in nucleus sampling. 1.0 = off. 0.9 works well, but slower
+    float topp = 0.9f;        // top-p in nucleus sampling. 1.0 = off. 0.9 works well, but slower
     rng_seed = 0; // seed rng with time by default
     int steps = 256;          // number of steps to run for
     char *prompt = NULL;      // prompt string

From 45afa91dca8808f4d767d132210e7093c42f004c Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Mon, 14 Aug 2023 02:54:27 +0000
Subject: [PATCH 28/30] the accum function has been bothering me, there is no
 real need to add a function here, it does something trivial and is only used
 twice, scrap

---
 run.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/run.c b/run.c
index 426e7e8..df95e6f 100644
--- a/run.c
+++ b/run.c
@@ -159,12 +159,6 @@ void checkpoint_init_weights(TransformerWeights *w, Config* p, float* f, int sha
 // ----------------------------------------------------------------------------
 // neural net blocks
 
-void accum(float *a, float *b, int size) {
-    for (int i = 0; i < size; i++) {
-        a[i] += b[i];
-    }
-}
-
 void rmsnorm(float* o, float* x, float* weight, int size) {
     // calculate sum of squares
     float ss = 0.0f;
@@ -312,7 +306,9 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
         matmul(s->xb2, s->xb, w->wo + l*dim*dim, dim, dim);
 
         // residual connection back into x
-        accum(x, s->xb2, dim);
+        for (int i = 0; i < dim; i++) {
+            x[i] += s->xb2[i];
+        }
 
         // ffn rmsnorm
         rmsnorm(s->xb, x, w->rms_ffn_weight + l*dim, dim);
@@ -336,7 +332,9 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
         matmul(s->xb, s->hb, w->w2 + l*dim*hidden_dim, hidden_dim, dim);
 
         // residual connection
-        accum(x, s->xb, dim);
+        for (int i = 0; i < dim; i++) {
+            x[i] += s->xb[i];
+        }
     }
 
     // final rmsnorm

From bae0bcf484493df65097a9fdae8b6157f338bf8d Mon Sep 17 00:00:00 2001
From: Andrej <andrej.karpathy@gmail.com>
Date: Sun, 13 Aug 2023 20:03:00 -0700
Subject: [PATCH 29/30] Small tweaks to Readme intro

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 99416d5..a180208 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,11 @@
   <img src="assets/llama_cute.jpg" width="300" height="300" alt="Cute Llama">
 </p>
 
-With the code in this repo you can train the Llama 2 LLM architecture from scratch in PyTorch, then export the weights to a binary file, and load that into one ~simple 500-line C file ([run.c](run.c)) that inferences the model. Alternatively, you can load, finetune, and inference Meta's Llama 2 (but this is still being actively fleshed out). Hence, this repo is a "fullstack" train + inference solution for Llama 2 LLM, with a focus on minimalism and simplicity. You might think that you need many billion parameter LLMs to do anything useful, but in fact very small LLMs can have surprisingly strong performance if you make the domain narrow enough. I recommend looking at the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) paper for inspiration.
+Train the Llama 2 LLM architecture in PyTorch then inference it with one simple 700-line C file ([run.c](run.c)). You might think that you need many billion parameter LLMs to do anything useful, but in fact very small LLMs can have surprisingly strong performance if you make the domain narrow enough (ref: [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) paper). This repo is a "fullstack" train + inference solution for Llama 2 LLM, with focus on minimalism and simplicity.
 
-Please note that this started recently as just a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. I wanted something super minimal so I chose to hard-code the Llama 2 architecture, stick to fp32, and just roll one inference file of pure C with no dependencies.
+As the architecture is identical, you can also load and inference Meta's Llama 2 models. However, the current code only inferences models in fp32, so you will most likely not be able to productively load models larger than 7B. Work on model quantization is currently ongoing.
+
+Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. Compared to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies.
 
 ## feel the magic
 

From 82ad2ba34ead544883ac84248c2dbd98a690c0aa Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Mon, 14 Aug 2023 05:53:57 +0000
Subject: [PATCH 30/30] remove tiktoken as dependency

---
 requirements.txt | 1 -
 sample.py        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e3f97c4..7187a73 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@ numpy==1.23.5
 pytest==7.4.0
 Requests==2.31.0
 sentencepiece==0.1.99
-tiktoken==0.3.3
 torch==2.0.1
 tqdm==4.64.1
 wandb==0.15.5
diff --git a/sample.py b/sample.py
index 64bb177..b26e277 100644
--- a/sample.py
+++ b/sample.py
@@ -5,7 +5,6 @@ import os
 import pickle
 from contextlib import nullcontext
 import torch
-import tiktoken
 from model import ModelArgs, Transformer
 from tokenizer import Tokenizer