From fe9b9f2f15eda96507837f1b2584e98401a61930 Mon Sep 17 00:00:00 2001
From: Jani Monoses <jani.monoses@gmail.com>
Date: Wed, 23 Aug 2023 17:28:14 +0300
Subject: [PATCH 1/6] Train vocab in Python

---
 README.md      |  2 +-
 tinystories.py | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index e9df1f6..f4e20e9 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,7 @@ python tinystories.py train_vocab --vocab_size=4096
 python tinystories.py pretokenize --vocab_size=4096
 ```
 
-The `train_vocab` stage will call the `train_vocab.sh` script, which calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
+The `train_vocab` stage will call the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
 
 A quick note of interest is that vocab size of 4096 trained specifically on tinystories creates integer sequences with about the same sequence length per example as the default Llama 2 tokenizer of 32000 tokens! This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster.
 
diff --git a/tinystories.py b/tinystories.py
index 90d576b..003b1e3 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -13,6 +13,7 @@ from functools import partial
 
 import numpy as np
 import requests
+import sentencepiece as spm
 import torch
 import torch.distributed as dist
 from tqdm import tqdm
@@ -97,16 +98,21 @@ def train_vocab(vocab_size):
                 of.write(text + "\n")
     print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
 
-    # 2) run the train_vocab.sh script that trains the sentencepiece model
-    print("Will now train the vocab with:")
-    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
-    print(cmd)
-    print("OK? [y/N] ")
-    dec = input()
-    if dec.lower() != "y":
-        print("Exiting...")
-        return
-    os.system(cmd)
+    # 2) train the sentencepiece model
+    print("Will now train the vocab...")
+
+    spm.SentencePieceTrainer.train(input=tiny_file,
+                                   model_prefix=prefix,
+                                   model_type="bpe",
+                                   vocab_size=vocab_size,
+                                   self_test_sample_size=0,
+                                   input_format="text",
+                                   character_coverage=1.0,
+                                   split_digits=True,
+                                   allow_whitespace_only_pieces=True,
+                                   byte_fallback=True,
+                                   unk_surface=r" \342\201\207 ",
+                                   normalization_rule_name="identity")
 
     # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
     dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")

From 9bc72acab041f471b655a763206a24513e8313d9 Mon Sep 17 00:00:00 2001
From: Ali Nehzat <ali.nehzat@thanks.dev>
Date: Thu, 24 Aug 2023 09:09:16 +1000
Subject: [PATCH 2/6] steps shouldn't exceed the model's seq_len either

---
 run.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.c b/run.c
index da1fbc4..0eaa655 100644
--- a/run.c
+++ b/run.c
@@ -839,7 +839,7 @@ int main(int argc, char *argv[]) {
     // build the Transformer via the model .bin file
     Transformer transformer;
     build_transformer(&transformer, checkpoint_path);
-    if (steps == 0) steps = transformer.config.seq_len; // ovrerride to ~max length
+    if (steps == 0 || steps > transformer.config.seq_len) steps = transformer.config.seq_len; // override to ~max length
 
     // build the Tokenizer via the tokenizer .bin file
     Tokenizer tokenizer;

From 096325b66c2ab84095bd407cbab84d731edc65bc Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Thu, 24 Aug 2023 03:09:55 +0000
Subject: [PATCH 3/6] bring back num_threads

---
 tinystories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinystories.py b/tinystories.py
index 003b1e3..800d73a 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -100,7 +100,6 @@ def train_vocab(vocab_size):
 
     # 2) train the sentencepiece model
     print("Will now train the vocab...")
-
     spm.SentencePieceTrainer.train(input=tiny_file,
                                    model_prefix=prefix,
                                    model_type="bpe",
@@ -108,6 +107,7 @@ def train_vocab(vocab_size):
                                    self_test_sample_size=0,
                                    input_format="text",
                                    character_coverage=1.0,
+                                   num_threads=os.cpu_count(),
                                    split_digits=True,
                                    allow_whitespace_only_pieces=True,
                                    byte_fallback=True,

From d7cd98633dcc50c9e58f4b39b105fe9f9494cf85 Mon Sep 17 00:00:00 2001
From: Andrej <andrej.karpathy@gmail.com>
Date: Thu, 24 Aug 2023 09:04:52 -0700
Subject: [PATCH 4/6] add todo item to add a PyTorch Engine

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f4e20e9..1652b4c 100644
--- a/README.md
+++ b/README.md
@@ -327,9 +327,10 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
 - support Llama 2 7B Chat models with a Chat UI/UX in run.c, very similar to llama.cpp
 - ability to calculate perplexity in run.c, exactly as done in llama.cpp
 - add support in run.c of reading version 1+ files from export, later deprecate "version 0"
-- add more tests inside [test.c](test.c) (call for help!)
+- add more tests in [test.c](test.c)
 - runq.c (int8 quantization) add
 - run.cu (CUDA) investigate and merge
+- add an Engine class that serves the model ~efficiently but in PyTorch (see [Issue 346](https://github.com/karpathy/llama2.c/issues/346))
 - make it easier to add a new dataset with not too much pain
 - (LoRA) finetuning and export of Llama 2 models
 

From 19cfbeca71f8d65d3b19a38e4dec9f4e37a731cf Mon Sep 17 00:00:00 2001
From: Diego Marcos Segura <diego.marcos@gmail.com>
Date: Thu, 24 Aug 2023 19:45:23 -0700
Subject: [PATCH 5/6] Fix typo in README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1652b4c..d08f611 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ Train the Llama 2 LLM architecture in PyTorch then inference it with one simple
 
 As the architecture is identical, you can also load and inference Meta's Llama 2 models. However, the current code only inferences models in fp32, so you will most likely not be able to productively load models larger than 7B. Work on model quantization is currently ongoing.
 
-Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. Compred to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies.
+Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. Compared to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies.
 
 ## feel the magic
 

From 6def77d4baef39b9878643001f473e49af405c7c Mon Sep 17 00:00:00 2001
From: Markus Zhang <45726499+photomz@users.noreply.github.com>
Date: Fri, 25 Aug 2023 17:12:29 +0800
Subject: [PATCH 6/6] Correct WandB log step

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index e958538..e8321d8 100644
--- a/train.py
+++ b/train.py
@@ -271,7 +271,7 @@ while True:
                         "loss/val": losses["val"],
                         "lr": lr,
                         "mfu": running_mfu * 100,  # convert to percentage
-                    }
+                    }, step = iter_num
                 )
             except Exception as e:
                 print(f"logging to wandb failed: {e}")