add avx2 intrinsics maybe

2023-08-10 15:01:53 +00:00
20 changed files with 801 additions and 1788 deletions
@@ -4,12 +4,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h', '**/*.py']
+    paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/Makefile', '**/*.c', '**/*.h', '**/*.py']
-  # for manual triggering
-  workflow_dispatch:
+    paths: ['**/Makefile', '**/*.c', '**/*.h']

 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -17,7 +15,7 @@ env:
 jobs:
  # check basic builds to avoid breaking changes
  ubuntu-focal-make:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

    steps:
      - name: Clone
@@ -30,16 +28,6 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential -y

-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        run: |
-          python -m pip install --upgrade pip
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-
      - name: Build
        id: make_build
        run: |
@@ -50,10 +38,6 @@ jobs:
        run: |
          make runfast

-      - name: Test with pytest
-        run: |
-          pytest
-
  macOS-latest-make:
    runs-on: macos-latest

@@ -68,21 +52,6 @@ jobs:
        run: |
          brew update

-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        run: |
-          python -m pip install --upgrade pip
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-
-      - name: Build clang
-        id: make_build_clang
-        run: |
-          make run CC=clang
-
      - name: Build
        id: make_build
        run: |
@@ -93,17 +62,15 @@ jobs:
        run: |
          make runfast

-      - name: Test with pytest
-        run: pytest
-
-
-
+      - name: Build clang
+        id: make_build_clang
+        run: |
+          make run CC=clang

  windows-latest-make:
    runs-on: windows-latest

    strategy:
-      fail-fast: false  #necessary, otherwise the matrix breaks
      matrix:
        arch:
          - amd64
@@ -123,30 +90,11 @@ jobs:
        with:
          arch: ${{ matrix.arch }}

-      - name: Set up Python 3.10
-        if: matrix.arch != 'amd64_arm64'
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        if: matrix.arch != 'amd64_arm64'
-        run: |
-          python -m pip install --upgrade pip
-          if (Test-Path requirements.txt) {
-            pip install -r requirements.txt
-          }
-
      - name: Build ${{ matrix.arch }}
        id: build_msvc
        run: |
          .\build_msvc.bat

-      #cross-comiled, cannot be run on host
-      - name: Test with pytest
-        if: matrix.arch != 'amd64_arm64'
-        run: pytest
-
  windows-latest-mingw:
    runs-on: windows-latest

@@ -174,20 +122,3 @@ jobs:
        id: build_mingw
        run: |
          make win64
-
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        shell: powershell
-        run: |
-          python -m pip install --upgrade pip
-          if (Test-Path requirements.txt) {
-            pip install -r requirements.txt
-          }
-
-      - name: Test with pytest
-        shell: powershell
-        run: pytest
@@ -32,6 +32,15 @@ runfast: run.c
 runomp: run.c
 	$(CC) -Ofast -fopenmp -march=native run.c  -lm  -o run

+# AVX2 builds: -mavx2 lets the compiler emit AVX2 instructions and -DLLAMAC_AVX2 defines that macro for run.c (presumably gating its intrinsics path - confirm in run.c); the binary may not run on CPUs without AVX2
+.PHONY: runavx2
+runavx2: run.c
+	$(CC) -Ofast -march=native -mavx2 -DLLAMAC_AVX2 -o run run.c -lm
+
+.PHONY: runompavx2
+runompavx2: run.c
+	$(CC) -Ofast -fopenmp -march=native -mavx2 -DLLAMAC_AVX2 run.c  -lm  -o run
+
 .PHONY: win64
 win64:
 	x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -o run.exe -I. run.c win.c
@@ -45,24 +54,6 @@ rungnu:
 runompgnu:
 	$(CC) -Ofast -fopenmp -std=gnu11 run.c  -lm  -o run

-# run all tests
-.PHONY: test
-test:
-	pytest
-
-# run only tests for run.c C implementation (is a bit faster if only C code changed)
-.PHONY: testc
-testc:
-	pytest -k runc
-
-# run the C tests, without touching pytest / python
-# to increase verbosity level run e.g. as `make testcc VERBOSITY=1`
-VERBOSITY ?= 0
-.PHONY: testcc
-testcc:
-	$(CC) -DVERBOSITY=$(VERBOSITY) -O3 -o testc test.c -lm
-	./testc
-
 .PHONY: clean
 clean:
 	rm -f run
@@ -4,11 +4,9 @@
  <img src="assets/llama_cute.jpg" width="300" height="300" alt="Cute Llama">
 </p>

-Train the Llama 2 LLM architecture in PyTorch then inference it with one simple 700-line C file ([run.c](run.c)). You might think that you need many billion parameter LLMs to do anything useful, but in fact very small LLMs can have surprisingly strong performance if you make the domain narrow enough (ref: [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) paper). This repo is a "fullstack" train + inference solution for Llama 2 LLM, with focus on minimalism and simplicity.
+With the code in this repo you can train the Llama 2 LLM architecture from scratch in PyTorch, then export the weights to a binary file, and load that into one ~simple 500-line C file ([run.c](run.c)) that inferences the model. Alternatively, you can load, finetune, and inference Meta's Llama 2 (but this is still being actively fleshed out). Hence, this repo is a "fullstack" train + inference solution for Llama 2 LLM, with a focus on minimalism and simplicity. You might think that you need many billion parameter LLMs to do anything useful, but in fact very small LLMs can have surprisingly strong performance if you make the domain narrow enough. I recommend looking at the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) paper for inspiration.

-As the architecture is identical, you can also load and inference Meta's Llama 2 models. However, the current code only inferences models in fp32, so you will most likely not be able to productively load models larger than 7B. Work on model quantization is currently ongoing.
-
-Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. Compared to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies.
+Please note that this started recently as just a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. I wanted something super minimal so I chose to hard-code the Llama 2 architecture, stick to fp32, and just roll one inference file of pure C with no dependencies.

 ## feel the magic

@@ -58,20 +56,18 @@ You can also prompt the model with a prefix or a number of additional command li

 > One day, Lily met a Shoggoth. He was very shy, but was also very generous. Lily said “Hello Shoggy! Can I be your friend?” Shoggy was happy to have a friend and said “Yes, let’s explore the universe together!” So they set off on a journey to explore the universe. As they travelled, Shoggy was happy to explain to Lily about all the wonderful things in the universe. At the end of the day, Lily and Shoggy had gathered lots of wonderful things from the universe, and they both felt very proud. They promised to explore the universe as one big pair and to never stop being generous to each other.

-There is also an even better 110M param model available, see [models](#models).
-
-Quick note on sampling, the recommendation for ~best results is to sample with `-t 1.0 -p 0.9`, i.e. temperature 1.0 (default) but also top-p sampling at 0.9 (default). Intuitively, top-p ensures that tokens with tiny probabilities do not get sampled, so we can't get "unlucky" during sampling, and we are less likely to go "off the rails" afterwards. More generally, to control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
+There is also an even better 110M param model available, see [models](#models). Quick note on sampling, the recommendation for good results is to use `-t 1.0 -p 0.9`, i.e. top-p sampling at 0.9 with temperature 1.0 (this is the default). To control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).

 ## Meta's Llama 2 models

 As the neural net architecture is identical, we can also inference the Llama 2 models released by Meta. Sadly there is a bit of friction here due to licensing (I can't directly upload the checkpoints, I think). So Step 1, get the Llama 2 checkpoints by following the [Meta instructions](https://github.com/facebookresearch/llama). Once we have those checkpoints, we have to convert them into the llama2.c format.
-For this we need to install the python dependencies (`pip install -r requirements.txt`) and then use the `export.py` file, e.g. for 7B model:
+For this we need to install the python dependencies (`pip install -r requirements.txt`) and then use the `export_meta_llama_bin.py` file, e.g. for 7B model:

 ```bash
-python export.py llama2_7b.bin --meta-llama path/to/llama/model/7B
+python export_meta_llama_bin.py path/to/llama/model/7B llama2_7b.bin
 ```

-The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that despite efforts. I would not attempt to run anything above 7B right now for two reasons: first, 13B+ currently doesn't work because of integer flow in pointer arithmetic, which is yet to be fixed, and second, even if it were fixed, this repo is doing float32 inference right now, so it would be fairly unusably slow. Once the export is done, we can run it:
+The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that despite efforts, the 13B export currently doesn't work for unknown reasons (accepting PRs for fix). We can run the model as normal:

 ```bash
 ./run llama2_7b.bin
@@ -83,48 +79,15 @@ This ran at about 4 tokens/s compiled with [OpenMP](#OpenMP) on 96 threads on my

 base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should be possible to also inference the chat model quite easily, and have a conversation with it. And if we can find a way to run 7B more efficiently, we can start adding LoRA to our training script, and going wild with finetunes all within the repo!

-You can also chat with the Llama Chat models. Export the chat model exactly as above:
-
-```bash
-python export.py llama2_7b_chat.bin --meta-llama /path/to/7B-chat
-```
-
-Then chat with it by specifying the chat mode using the `-m` flag, e.g.:
-
-```bash
-./run llama2_7b_chat.bin -m chat
-```
-
-You can also try Meta's Code Llama models even if support for them is incomplete. In particular, some hyperparameters changed (e.g. the constant in RoPE layer), so the inference is not exactly correct and a bit buggy right now. Looking into fixes. Make sure to build the tokenizer for the plain and instruct variants and pass it when doing inference.
-
-```bash
-python export.py codellama2_7b.bin --meta-llama /path/to/CodeLlama-7b
-python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b/tokenizer.model
-./run codellama2_7b.bin -z /path/to/CodeLlama-7b/tokenizer.bin
-```
-
-Chat with Code Llama Instruct:
-
-```bash
-python export.py codellama2_7b_instruct.bin --meta-llama /path/to/CodeLlama-7b-Instruct
-python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b-Instruct/tokenizer.model
-./run codellama2_7b_instruct.bin -m chat -z /path/to/CodeLlama-7b-Instruct/tokenizer.bin
-```
-
-## hugginface models
-
-We can load any huggingface models that use the Llama 2 architecture. See the script [export.py](export.py) and the `--hf` flag to export the model .bin file.
-
 ## models

 For the sake of examples of smaller, from-scratch models, I trained a small model series on TinyStories. All of these trained in a few hours on my training setup (4X A100 40GB GPUs). The 110M took around 24 hours. I am hosting them on huggingface hub [tinyllamas](https://huggingface.co/karpathy/tinyllamas), both in the original PyTorch .pt, and also in the llama2.c format .bin:

-| model | dim | n_layers | n_heads | n_kv_heads | max context length | parameters | val loss | download
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| 260K | 64 | 5 | 8 | 4 | 512 | 260K | 1.297 | [stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
-| OG | 288 | 6 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
-| 42M| 512 | 8 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
-| 110M| 768 | 12 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |
+| model | dim | n_layers | n_heads | max context length | parameters | val loss | download
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| OG | 288 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
+| 42M| 512 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
+| 110M| 768 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |

 You'll notice that the 110M model is equivalent to GPT-1 in size. Alternatively, this is also the smallest model in the GPT-2 series (`GPT-2 small`), except the max context length is only 1024 instead of 2048. The only notable changes from GPT-1/2 architecture is that Llama uses RoPE relatively positional embeddings instead of absolute/learned positional embeddings, a bit more fancy SwiGLU non-linearity in the MLP, RMSNorm instead of LayerNorm, bias=False on all Linear layers, and is optionally multiquery (but this is not yet supported in llama2.c).

@@ -167,53 +130,15 @@ Watch the tokens stream by, fun! We can also run the PyTorch inference script fo

 ```bash
 wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt -P out15M
-python sample.py --checkpoint=out15M/stories15M.pt
+mv out15M/stories15M.pt out15M/ckpt.pt # sorry, the sample script currently assumes this directory structure / filename...
+python sample.py --out_dir=out15M
 ```

-Which gives the same results.
-
-## custom tokenizers
-
-In everything above, we've assumed the custom Lllama 2 tokenizer with 32,000 tokens. However, in many boutique LLMs, using vocabulary this big might be an overkill. If you have a small application you have in mind, you might be much better off training your own tokenizers. This can make everything nicer - with smaller vocabs your model has fewer parameters (because the token embedding table is a lot smaller), the inference is faster (because there are fewer tokens to predict), and your average sequence length per example could also get smaller (because the compression is a lot more efficient on your data). So let's see how we train a custom tokenizer.
-
-By default, to pretokenize the tinystories dataset we had to run, in order:
+Which gives the same results. More detailed testing will be done in `test_all.py`. Currently you will need two files to test or sample: both the .bin file, and the `ckpt.pt` checkpoint file inside a directory (see `test_all.py` for details). Sorry this is a bit janky right now, I have to think through running the tests without having to download 200MB of data. For now, run the tests with pytest:

+```bash
+$ pytest
 ```
-python tinystories.py download
-python tinystories.py pretokenize
-```
-
-The `pretokenize` stage here loads the Llama 2 tokenizer (vocab size 32,000) and uses it to convert the downloaded text into integers, and saves that to file. We now change this as follows, to train an example 4096-token tokenizer:
-
-```
-python tinystories.py download
-python tinystories.py train_vocab --vocab_size=4096
-python tinystories.py pretokenize --vocab_size=4096
-```
-
-The `train_vocab` stage will call the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
-
-A quick note of interest is that vocab size of 4096 trained specifically on tinystories creates integer sequences with about the same sequence length per example as the default Llama 2 tokenizer of 32000 tokens! This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster.
-
-Now that we have pretokenized the dataset with our custom tokenizer, we can train the model. The training script `train.py` doesn't care about the exact tokens, it only cares about the vocabulary size so it can correctly initialize the model. So when training your model, make sure to pass in
-
-```
-python train.py --vocab_source=custom --vocab_size=4096
-```
-
-(The defaults are `llama2` and `32000` respectively, which indicates the default Llama 2 tokenizer). This trains the model. Finally we are ready to run inference with our `run.c` script. For that we need two things. Number one, we have to export our tokenizer in the `.bin` format, do that with:
-
-```
-python tokenizer.py --tokenizer-model=data/tok4096.model
-```
-
-This writes the tokenizer to `data/tok4096.bin`. Now we can run inference, pointing it to this tokenizer using the `-z` flag:
-
-```
-./run out/model.bin -z data/tok4096.bin
-```
-
-This should print the samples. If you leave out the `-z` flag, it will use the default Llama 2 tokenizer, which would generate a good sequence of integers, but they would get translated using a different vocabulary to text, so it would look like gibberish.

 ## performance

@@ -235,8 +160,9 @@ You can also experiment with replacing `gcc` with `clang`.

 If compiling with gcc, try experimenting with `-funroll-all-loops`, see PR [#183](https://github.com/karpathy/llama2.c/pull/183)

-**OpenMP**. Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul and attention, allowing the work in the loops to be split up over multiple processors.
-You'll need to install the OpenMP library and the clang compiler first (e.g. `apt install clang libomp-dev` on ubuntu). Then you can compile with `make runomp`, which does:
+### OpenMP
+Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul and attention, allowing the work in the loops to be split up over multiple processors.
+You'll need to install the OpenMP library and the clang compiler first (e.g. `apt install clang libomp-dev` on ubuntu). I was not able to get improvements from OpenMP on my MacBook, though. Then you can compile with `make runomp`, which does:

 ```bash
 clang -Ofast -fopenmp -march=native run.c  -lm  -o run
@@ -248,8 +174,7 @@ When you run inference make sure to use OpenMP flags to set the number of thread
 OMP_NUM_THREADS=4 ./run out/model.bin
 ```

-Depending on your system resources you may want to tweak these hyperparameters and use more threads. But more is not always better, usually this is a bit U shaped. In particular, if your CPU has SMT (multithreading), try setting the number of threads to the number of physical cores rather than logical cores. The performance difference can be large due to cache thrashing and communication overhead. The PyTorch documentation [CPU specific optimizations
-](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) has some good information that applies here too.
+Depending on your system resources you may want to tweak these hyperparameters and use more threads. But more is not always better, usually this is a bit U shaped.

 ## platforms

@@ -257,27 +182,6 @@ On **Windows**, use `build_msvc.bat` in a Visual Studio Command Prompt to build

 On **Centos 7**, **Amazon Linux 2018** use `rungnu` Makefile target: `make rungnu` or `make runompgnu` to use openmp.

-On **Mac**, use clang from brew for openmp build. Install clang as `brew install llvm` and use the installed clang binary to compile with openmp: `make runomp CC=/opt/homebrew/opt/llvm/bin/clang`
-
-## tests
-
-You can run tests simply with pytest:
-
-```bash
-$ pip install pytest
-$ pytest
-```
-
-This will currently invoke two tests inside `test_all.py`, which forward the model in both C and Python for 200 steps and check the output against a known good expected output. The tests currently run in only a few seconds, but will have to download and cache the stories260K models in a temporary `test` directory (only ~2MB download).
-
-There are also some tests in C, in the file [test.c](test.c). You can run these with `make testcc`, or to see more stuff printed:
-
-```
-make testcc VERBOSITY=1
-```
-
-Call for help: help add more tests.
-
 ## ack

 I trained the llama2.c storyteller models on a 4X A100 40GB box graciously provided by the excellent [Lambda labs](https://lambdalabs.com/service/gpu-cloud), thank you.
@@ -310,8 +214,6 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
  - [llama2.rs](https://github.com/gaxler/llama2.rs) by @[gaxler](https://github.com/gaxler): a Rust port of this project
  - [llama2.rs](https://github.com/leo-du/llama2.rs) by @[leo-du](https://github.com/leo-du): A Rust port of this project
  - [llama2-rs](https://github.com/danielgrittner/llama2-rs) by @[danielgrittner](https://github.com/danielgrittner): a Rust port of this project
-  - [llama2.rs](https://github.com/lintian06/llama2.rs) by @[lintian06](https://github.com/lintian06): A Rust port of this project
-  - [pecca.rs](https://github.com/rahoua/pecca-rs) by @[rahoua](https://github.com/rahoua): A Rust port leveraging [ndarray](https://github.com/rust-ndarray/ndarray), supports BLAS.
 - Go
  - [go-llama2](https://github.com/tmc/go-llama2) by @[tmc](https://github.com/tmc): a Go port of this project
  - [llama2.go](https://github.com/nikolaydubina/llama2.go) by @[nikolaydubina](https://github.com/nikolaydubina): a Go port of this project
@@ -324,7 +226,6 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
  - [llama2.cpp](https://github.com/leloykun/llama2.cpp) by @[leloykun](https://github.com/leloykun): a C++ port of this project
 - JavaScript
  - [llama2.js](https://github.com/epicure/llama2.js) by @[epicure](https://github.com/epicure): a JavaScript port of this project
-  - [llama2.ts](https://github.com/wizzard0/llama2.ts) by @[oleksandr_now](https://twitter.com/oleksandr_now): a TypeScript port of this project. Full Llama2-7B capable.
  - [llama2.c-emscripten](https://github.com/gohai/llama2.c-emscripten) by @[gohai](https://github.com/gohai): Emscripten (JavaScript) port, based on @ggerganov's initial prototype
 - Zig
  - [llama2.zig](https://github.com/cgbur/llama2.zig) by @[cgbur](https://github.com/cgbur): A Zig port of this project
@@ -342,21 +243,18 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
  - [llama2.py](https://github.com/tairov/llama2.py) by @[tairov](https://github.com/tairov): a simple one file pure Python port of this project with zero dependencies
 - C#
  - [llama2.cs](https://github.com/trrahul/llama2.cs) by @[trrahul](https://github.com/trrahul): a C# port of this project
- Dart
-  - [llama2.dart](https://github.com/yiminghan/llama2.dart) by @[yiminghan](https://github.com/yiminghan/llama2.dart): one-file dart port of this project, works with Flutter!
- WebAssembly
-  - [icpp-llm](https://github.com/icppWorld/icpp-llm): LLMs for the Internet Computer
 - [llama2.c - Llama 2 Everywhere](https://github.com/trholding/llama2.c) by @[trholding](https://github.com/trholding): Standalone, Bootable & Portable Binary Llama 2
- [llama2.c-zh - Bilingual Chinese and English](https://github.com/chenyangMl/llama2.c-zh) by @[chenyangMl](https://github.com/chenyangMl): Expand tokenizer to support training and inference in both Chinese and English

 ## unsorted todos

- add support in run.c of reading version 1+ files from export, later deprecate "version 0"
- runq.c (int8 quantization) add
- run.cu (CUDA) investigate and merge
- add more tests inside [test.c](test.c)
- add Engine class for use in sample.py that does efficient inference in PyTorch, e.g. KV cache keeping
- make it easier to add a new dataset with not too much pain
+- add multiquery support into run.c
+- add custom bpe training code and the ability to train a smaller vocabulary (32K is too much)
+- should calculate freq_cis online in the script run.c instead of loading them
+- int4/8 quantization
+- export the model in a more sensible output format with a proper header, etc.
+- train a tiny Llama test model (committed to repo) and use it as reference in unit tests
+- support Llama 2 7B Chat models and tune run.c to Chat UI/UX
+- llama2.cu investigate and merge
 - (LoRA) finetuning and export of Llama 2 models

 ## License
@@ -1,58 +0,0 @@
-# stories260K
-
-[Stories260K huggginface link](https://huggingface.co/karpathy/tinyllamas)
-
-The 260K model is a tiny model used for testing, and was trained as follows:
-
-```
-python train.py \
-    --out_dir="outmini" \
-    --batch_size=128 \
-    --max_seq_len=512 \
-    --gradient_accumulation_steps=1 \
-    --vocab_source="custom" \
-    --vocab_size=512 \
-    --dim=64 \
-    --n_layers=5 \
-    --n_heads=8 \
-    --n_kv_heads=4 \
-    --multiple_of=4 \
-    --learning_rate=1e-3 \
-    --dropout=0.05 \
-    --weight_decay=0.01 \
-    --max_iters=100000 \
-    --beta2=0.99 \
-    --warmup_iters=1000 \
-    --eval_interval=2000 \
-    --eval_iters=100 \
-    --compile=True
-```
-
-You'll notice that `n_kv_heads` is 4 while `n_heads` is 8, so two heads at a time share their key,value projections, i.e. this model is 2X multiquery. You'll also notice that we're using a custom tokenizer with 512 tokens. The model trained for ~10 minutes (?) on my A100 and achieves validation loss of 1.2968.
-
-Sampling this model at temperature 0.0 (i.e. deterministic greedy argmax sampling) gives:
-
-```
-$ ./run stories260K/stories260K.bin -z stories260K/tok512.bin -t 0.0
-Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.
-Lily's mom said, "Lily, let's go to the park." Lily was sad and didn't know what to do. She said, "I want to play with your ball, but I can't find it."
-Lily was sad and didn't know what to do. She said, "I'm sorry, Lily. I didn't know what to do."
-Lily didn't want to help her mom, so she said, "I'm sorry, mom. I didn't know what to do." Her mom said, "Don't worry, Lily. We can help you.
-```
-
-You can reproduce the same in Python by running `sample.py`:
-
-```
-$ python sample.py --checkpoint=stories260K/stories260K.pt --tokenizer=stories260K/tok512.model --temperature=0.0 --max_new_tokens=257
-```
-
-I hardcoded max tokens to be 257 manually because the `sample.py` script doesn't currently terminate on the special BOS token like the run.c script does. Sampling at 1.0 with topp of 0.9 gives a bit more reasonable samples:
-
-```
-$ ./run stories260K/stories260K.bin -z stories260K/tok512.bin -t 1.0 -p 0.9 -s 133742
-Once upon a time, there was a little boy named Timmy. Timmy loved to play with his toys and eat sandwiches. One day, Timmy's mom told him it was time to rest for a while. Timmy's friend Billy came over and took him a down.
-Timmy's mom saw that Timmy was sad, but Timmy said, "I didn't understand what is it! We need to find some leafs." Timmy thought about it and took a deep breath on a spoon. He hoped it was important to be kind and continued to find its image next time.
-After they finished getting, Timmy's dad came up to his house and promised to help Timmy.
-```
-
-Hey you can't expect too much from a 260K parameter model. I'm even mildly shocked we get this far :D
@@ -1,99 +0,0 @@
-# training llama tokenizer
-
-How does Meta train their sentencepiece tokenizer? You can print the config as follows:
-
-```python
-import sentencepiece.sentencepiece_model_pb2
-mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
-mp.ParseFromString(open("tokenizer.model", "rb").read())
-print(mp.trainer_spec)
-print(mp.normalizer_spec)
-```
-
-this gives:
-
-```
-trainer_spec {
-  input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
-  model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
-  model_type: BPE
-  vocab_size: 32000
-  self_test_sample_size: 0
-  input_format: "text"
-  character_coverage: 0.9999499917030334
-  input_sentence_size: 200000000
-  seed_sentencepiece_size: 1000000
-  shrinking_factor: 0.75
-  num_threads: 80
-  num_sub_iterations: 2
-  max_sentence_length: 4192
-  shuffle_input_sentence: true
-  max_sentencepiece_length: 16
-  split_by_unicode_script: true
-  split_by_whitespace: true
-  split_by_number: true
-  treat_whitespace_as_suffix: false
-  split_digits: true
-  allow_whitespace_only_pieces: true
-  vocabulary_output_piece_score: true
-  hard_vocab_limit: true
-  use_all_vocab: false
-  byte_fallback: true
-  required_chars: ""
-  unk_id: 0
-  bos_id: 1
-  eos_id: 2
-  pad_id: -1
-  unk_surface: " \342\201\207 "
-  unk_piece: "<unk>"
-  bos_piece: "<s>"
-  eos_piece: "</s>"
-  pad_piece: "<pad>"
-  train_extremely_large_corpus: false
-  enable_differential_privacy: false
-  differential_privacy_noise_level: 0.0
-  differential_privacy_clipping_threshold: 0
-}
-normalizer_spec {
-  name: "identity"
-  precompiled_charsmap: ""
-  add_dummy_prefix: true
-  remove_extra_whitespaces: false
-  normalization_rule_tsv: ""
-}
-```
-
-We can use the sentencepiece spm_train to train the same models, but optionally smaller. Here are their [options docs](https://github.com/google/sentencepiece/blob/master/doc/options.md) we can refer to. It's not much but it helps.
-
-We'll depart on one setting, I recommend changing `character_coverage` -> 1.0. We also want to make sure to note the following important settings that come up in the paper and are not necessarily the default sentencepiece settings:
-
-```
--split-digits = true
--allow_whitespace_only_pieces = true
--byte_fallback = true
--normalization_rule_name = identity
-```
-
-With this in mind we can train a sentencepiece vocab in what I believe is probably the same to how Meta trained theirs as:
-
-```
-spm_train --input="$input" \
-          --model_prefix="$model_prefix" \
-          --model_type=bpe \
-          --vocab_size="$vocab_size" \
-          --self_test_sample_size=0 \
-          --input_format="text" \
-          --character_coverage=1.0 \
-          --num_threads="$(nproc)" \
-          --split_digits=true \
-          --allow_whitespace_only_pieces=true \
-          --byte_fallback=true \
-          --unk_surface=" \342\201\207 " \
-          --normalization_rule_name=identity \
-```
-
-Where $input is the input file, $model_prefix is the output path prefix, vocab_size is the desired vocab, and we're by default taking over the CPU resources of the machine.
-
-Lastly note that sentencepiece is weird and expects "sentences" delimited by newlines as the input. You can't just put in a massive block of text. And they have a hyperparameter that constols the maximum size of a "sentence". Fwiw I really dislike this design choice around a weird concept of a "sentence". It should just be block of text with no assumptions. But here we are.
-
-Look into the file `tinystories.py` where we train the vocab in the same way, but using Python bindings instead.
@@ -1,471 +0,0 @@
-"""
-This script has functions and utilities for model export.
-Basically, we have a bunch of versions of the model, and we
-want to export them to .bin files to be read from and inferenced in C.
-
-Among the "input" versions of PyTorch files/models:
- Official Llama 2 weights released by Meta
- Huggingface weights available on the hub
- llama2.c (this repo) trained models
-
-Among the "output" versions of .bin files:
- v0: Legacy files of the original llama2.c repo (will eventually be DEPRECATED)
- v1-vN: Improved .bin files with a proper header, cache alignment, etc.
-
-This script aspires to provide all of these conversions.
-"""
-import os
-import gzip
-import shutil
-import struct
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import torch
-from torch import nn
-
-from model import ModelArgs, Transformer
-
-# -----------------------------------------------------------------------------
-# common utilities
-
-def serialize_fp32(file, tensor):
-    """ writes one fp32 tensor to file that is open in wb mode """
-    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
-    b = struct.pack(f'{len(d)}f', *d)
-    file.write(b)
-
-def serialize_int8(file, tensor):
-    """ writes one int8 tensor to file that is open in wb mode """
-    d = tensor.detach().cpu().view(-1).numpy().astype(np.int8)
-    b = struct.pack(f'{len(d)}b', *d)
-    file.write(b)
-
-def quantize_q80(w, group_size):
-    """
-    takes a tensor and returns the Q8_0 quantized version
-    i.e. symmetric quantization into int8, range [-127,127]
-
-    w is flattened into rows of group_size elements and each row gets its own scale.
-    Returns (int8val, scale, maxerr):
-      int8val: int8 tensor of shape (numel // group_size, group_size)
-      scale:   fp32 tensor with one scale per group, such that float = quant * scale
-      maxerr:  python float, largest absolute round-trip error over all elements
-    Raises AssertionError if w.numel() is not a multiple of group_size.
-    """
-    assert w.numel() % group_size == 0
-    w = w.float() # convert to float32
-    w = w.reshape(-1, group_size)
-    # find the max in each group
-    wmax = torch.abs(w).max(dim=1).values
-    # calculate the scaling factor such that float = quant * scale
-    scale = wmax / 127.0
-    # an all-zero group has scale 0; dividing by it would give 0/0 = NaN and the
-    # NaN -> int8 cast is not well defined. divide by 1 there instead (result is 0),
-    # while still returning the true scale of 0 for that group
-    safe_scale = torch.where(scale == 0, torch.ones_like(scale), scale)
-    # scale into range [-127, 127]
-    quant = w / safe_scale[:,None]
-    # round to nearest integer
-    int8val = torch.round(quant).to(torch.int8)
-    # dequantize by rescaling, keeping the (groups, group_size) layout
-    fp32valr = int8val.float() * scale[:,None]
-    # calculate the max error in each group
-    err = torch.abs(fp32valr - w).max(dim=1).values
-    # find the max error across all groups
-    maxerr = err.max().item()
-    return int8val, scale, maxerr
-
-# -----------------------------------------------------------------------------
-# legacy
-
-def legacy_export(model, filepath):
-    """
-    Original export of llama2.c bin files, i.e. version v0
-
-    Layout: a header of 7 ints (dim, hidden_dim, n_layers, n_heads, n_kv_heads,
-    vocab_size, max_seq_len) followed by fp32 tensors in this order: token
-    embeddings, per-layer attention norm / wq / wk / wv / wo, per-layer
-    ffn norm / w1 / w2 / w3, final norm, freqs_cos, freqs_sin and, only when it
-    is not shared with the embedding, the classifier weights.
-
-    The vocab_size written to the header is negated when the classifier is not
-    shared; model.params itself is left unmodified.
-    """
-    p = model.params
-    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
-    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
-    # legacy format uses negative/positive vocab size as a shared classifier flag.
-    # keep the signed value in a local so that exporting does not flip the sign
-    # stored on the model (which would corrupt a second export of the same model)
-    vocab_size = p.vocab_size if shared_classifier else -p.vocab_size
-    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                    n_kv_heads, vocab_size, p.max_seq_len)
-
-    # the with-block closes the file even if serialization fails part-way
-    with open(filepath, 'wb') as out_file:
-        # first write out the header
-        out_file.write(header)
-
-        # next write out the embedding weights
-        serialize_fp32(out_file, model.tok_embeddings.weight)
-
-        # now all the layers; each tensor kind is written for every layer
-        # before moving on to the next kind
-        # attention weights
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.attention_norm.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.attention.wq.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.attention.wk.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.attention.wv.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.attention.wo.weight)
-        # ffn weights
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.ffn_norm.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.feed_forward.w1.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.feed_forward.w2.weight)
-        for layer in model.layers:
-            serialize_fp32(out_file, layer.feed_forward.w3.weight)
-        # final rmsnorm
-        serialize_fp32(out_file, model.norm.weight)
-        # freqs_cis
-        serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
-        serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])
-
-        # final classifier weights
-        if not shared_classifier:
-            serialize_fp32(out_file, model.output.weight)
-
-    print(f"wrote {filepath}")
-
-# -----------------------------------------------------------------------------
-# new version
-
-def version1_export(model, filepath):
-    """
-    Export the model weights in full float32 .bin file to be read from C.
-    This is same as legacy_export, but with a proper header.
-
-    The header is 256 bytes: a uint32 magic, an int version, 7 int params
-    (dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, max_seq_len),
-    one byte flagging a shared classifier, then zero padding. The fp32 tensors
-    follow: all norms first, then the embedding and the per-layer matrices, and
-    finally the classifier if it is not shared with the embedding.
-    """
-    version = 1
-
-    p = model.params
-    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
-    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
-
-    # collect all the params in the order they are written
-    weights = [
-        *[layer.attention_norm.weight for layer in model.layers],
-        *[layer.ffn_norm.weight for layer in model.layers],
-        model.norm.weight,
-        model.tok_embeddings.weight,
-        *[layer.attention.wq.weight for layer in model.layers],
-        *[layer.attention.wk.weight for layer in model.layers],
-        *[layer.attention.wv.weight for layer in model.layers],
-        *[layer.attention.wo.weight for layer in model.layers],
-        *[layer.feed_forward.w1.weight for layer in model.layers],
-        *[layer.feed_forward.w2.weight for layer in model.layers],
-        *[layer.feed_forward.w3.weight for layer in model.layers],
-    ]
-    if not shared_classifier:
-        weights.append(model.output.weight)
-
-    # the with-block closes the file even if serialization fails part-way
-    with open(filepath, 'wb') as out_file:
-        # first write out the header. the header will be 256 bytes
-        # 1) write magic, which will be uint32 of "ak42" in ASCII
-        out_file.write(struct.pack('I', 0x616b3432))
-        # 2) write version, which will be int
-        out_file.write(struct.pack('i', version))
-        # 3) write the params, which will be 7 ints
-        header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                        n_kv_heads, p.vocab_size, p.max_seq_len)
-        out_file.write(header)
-        # 4) write some other flags
-        out_file.write(struct.pack('B', int(shared_classifier)))
-        pad = 256 - out_file.tell() # pad rest with zeros; tell returns current pos
-        assert pad >= 0
-        out_file.write(b'\0' * pad)
-
-        # now let's write out all the params
-        for w in weights:
-            serialize_fp32(out_file, w)
-
-    print(f"wrote {filepath}")
-
-def version2_export(model, filepath, group_size=64):
-    """
-    Export the model weights in Q8_0 into .bin file to be read from C.
-    That is:
-    - quantize all weights to symmetric int8, in range [-127, 127]
-    - all other tensors (the rmsnorm params) are kept and exported in fp32
-    - quantization is done in groups of group_size to reduce the effects of any outliers
-
-    group_size is halved until it divides model.params.dim; the value actually
-    used is stored in the header. Raises AssertionError if the element count of
-    any quantized weight is not a multiple of the final group_size.
-
-    File layout: 256-byte header, fp32 norms, all int8 weights, then all fp32
-    scales (one block of scales per weight, in the same order as the weights).
-    """
-    version = 2
-
-    # let's first do some validation for this export type
-    while model.params.dim % group_size != 0:
-        group_size //= 2
-        print(f"BACKOFF: reducing group size to {group_size} to fit hidden_dim")
-    weights = [
-        model.tok_embeddings.weight,
-        *[layer.attention.wq.weight for layer in model.layers],
-        *[layer.attention.wk.weight for layer in model.layers],
-        *[layer.attention.wv.weight for layer in model.layers],
-        *[layer.attention.wo.weight for layer in model.layers],
-        *[layer.feed_forward.w1.weight for layer in model.layers],
-        *[layer.feed_forward.w2.weight for layer in model.layers],
-        *[layer.feed_forward.w3.weight for layer in model.layers],
-    ]
-    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
-    if not shared_classifier:
-        weights.append(model.output.weight)
-    # enumerate so that the failure message can name the offending weight
-    for i, w in enumerate(weights):
-        assert w.numel() % group_size == 0, f"weight {i} has numel {w.numel()}, not a multiple of group_size {group_size}"
-
-    p = model.params
-    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
-    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-
-    # write; the with-block closes the file even if quantization or a write fails
-    with open(filepath, 'wb') as out_file:
-        # first write out the header. the header will be 256 bytes
-        # 1) write magic, which will be uint32 of "ak42" in ASCII
-        out_file.write(struct.pack('I', 0x616b3432))
-        # 2) write version, which will be int
-        out_file.write(struct.pack('i', version))
-        # 3) write the params, which will be 7 ints
-        header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                        n_kv_heads, p.vocab_size, p.max_seq_len)
-        out_file.write(header)
-        # 4) write some other flags
-        out_file.write(struct.pack('B', int(shared_classifier)))
-        out_file.write(struct.pack('i', group_size)) # group size used for quantization
-        pad = 256 - out_file.tell() # pad rest with zeros; tell returns current pos
-        assert pad >= 0
-        out_file.write(b'\0' * pad)
-        # now that the header is done, let's write out the model
-
-        # first let's write out all the params that we are keeping in fp32: the norms
-        for layer in model.layers: # attention norms
-            serialize_fp32(out_file, layer.attention_norm.weight)
-        for layer in model.layers: # MLP norms
-            serialize_fp32(out_file, layer.ffn_norm.weight)
-        serialize_fp32(out_file, model.norm.weight) # final pre-classifier norm
-
-        # now let's write out all the params that we are quantizing to Q8_0
-        # note the classifier is in `weights` only when it is not shared with the embedding
-        errors = []
-        scales = []
-        for i, w in enumerate(weights):
-            # quantize this weight
-            q, s, err = quantize_q80(w, group_size)
-            # save the int8 weights to file
-            serialize_int8(out_file, q) # save the tensor in int8
-            scales.append(s)  # we'll do all the scales after all the qs
-            # logging
-            errors.append(err)
-            print(f"{i+1}/{len(weights)} quantized {tuple(w.shape)} to Q8_0 with max error {err}")
-
-        # save the scaling factors in fp32 here
-        # this is done to keep all the weights contiguous, making pointer arithmetic easier in C
-        for s in scales:
-            serialize_fp32(out_file, s)
-
-    # print the highest error across all weights, should be very small, e.g. O(~0.001)
-    print(f"max quantization group error across all weights: {max(errors)}")
-
-    print(f"wrote {filepath}")
-
-
-# -----------------------------------------------------------------------------
-# Load / import functions
-
-def load_checkpoint(checkpoint):
-
-    # load the provided model checkpoint
-    checkpoint_dict = torch.load(checkpoint, map_location='cpu')
-    gptconf = ModelArgs(**checkpoint_dict['model_args'])
-    model = Transformer(gptconf)
-    state_dict = checkpoint_dict['model']
-    unwanted_prefix = '_orig_mod.'
-    for k,v in list(state_dict.items()):
-        if k.startswith(unwanted_prefix):
-            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-    model.load_state_dict(state_dict, strict=False)
-    model.eval()
-    return model
-
-def load_meta_model(model_path):
-    params_path = os.path.join(model_path, 'params.json')
-    with open(params_path) as f:
-        params = json.load(f)
-        print(params)
-
-    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
-    models = [torch.load(p, map_location='cpu') for p in model_paths]
-
-    def concat_weights(models):
-        state_dict = {}
-        for name in list(models[0]):
-            tensors = [model[name] for model in models]
-            if len(tensors) == 1 or len(tensors[0].shape) == 1:
-                state_dict[name] = tensors[0]
-                continue
-            is_axis_1 = (
-                name.startswith('tok_embeddings.')
-                or name.endswith('.attention.wo.weight')
-                or name.endswith('.feed_forward.w2.weight')
-            )
-            axis = 1 if is_axis_1 else 0
-            state_dict[name] = torch.cat(tensors, dim=axis)
-            for model in models:
-                del model[name]
-        return state_dict
-
-    state_dict = concat_weights(models)
-    del models
-
-    # set ModelArgs
-    config = ModelArgs()
-    config.dim = params["dim"]
-    config.n_layers = params["n_layers"]
-    config.n_heads = params["n_heads"]
-    config.n_kv_heads = params.get('n_kv_heads') or params['n_heads']
-    config.multiple_of = params["multiple_of"]
-    config.norm_eps = params["norm_eps"]
-
-    config.vocab_size = state_dict['tok_embeddings.weight'].shape[0]
-    config.max_seq_len = 2048
-
-
-    # create a new Transformer object and set weights
-    model = Transformer(config)
-
-    model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
-    model.norm.weight = nn.Parameter(state_dict['norm.weight'])
-
-    for layer in model.layers:
-        i = layer.layer_id
-        layer.attention_norm.weight = nn.Parameter(state_dict[f'layers.{i}.attention_norm.weight'])
-        layer.attention.wq.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wq.weight'])
-        layer.attention.wk.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wk.weight'])
-        layer.attention.wv.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wv.weight'])
-        layer.attention.wo.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wo.weight'])
-        layer.ffn_norm.weight = nn.Parameter(state_dict[f'layers.{i}.ffn_norm.weight'])
-        layer.feed_forward.w1.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w1.weight'])
-        layer.feed_forward.w2.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w2.weight'])
-        layer.feed_forward.w3.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w3.weight'])
-
-    # final classifier
-    model.output.weight = nn.Parameter(state_dict['output.weight'])
-    model.eval()
-    return model
-
-def load_hf_model(model_path):
-
-    try:
-        from transformers import AutoModelForCausalLM
-    except ImportError:
-        print("Error: transformers package is required to load huggingface models")
-        print("Please run `pip install transformers` to install it")
-        return None
-
-    # load HF model
-    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
-    hf_dict = hf_model.state_dict()
-
-    # convert LlamaConfig to ModelArgs
-    config = ModelArgs()
-    config.dim = hf_model.config.hidden_size
-    config.n_layers = hf_model.config.num_hidden_layers
-    config.n_heads = hf_model.config.num_attention_heads
-    config.n_kv_heads = hf_model.config.num_attention_heads
-    config.vocab_size = hf_model.config.vocab_size
-    config.hidden_dim = hf_model.config.intermediate_size
-    config.norm_eps = hf_model.config.rms_norm_eps
-    config.max_seq_len = hf_model.config.max_position_embeddings
-
-    # create a new Transformer object and set weights
-    model = Transformer(config)
-
-    model.tok_embeddings.weight = nn.Parameter(hf_dict['model.embed_tokens.weight'])
-    model.norm.weight = nn.Parameter(hf_dict['model.norm.weight'])
-
-    # huggingface permutes WQ and WK, this function reverses it
-    def permute_reverse(w, n_heads=config.n_heads, dim1=config.dim, dim2=config.dim):
-        return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2)
-
-    for layer in model.layers:
-        i = layer.layer_id
-        layer.attention_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.input_layernorm.weight'])
-        layer.attention.wq.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.q_proj.weight']))
-        layer.attention.wk.weight = nn.Parameter(permute_reverse(hf_dict[f'model.layers.{i}.self_attn.k_proj.weight']))
-        layer.attention.wv.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.v_proj.weight'])
-        layer.attention.wo.weight = nn.Parameter(hf_dict[f'model.layers.{i}.self_attn.o_proj.weight'])
-        layer.ffn_norm.weight = nn.Parameter(hf_dict[f'model.layers.{i}.post_attention_layernorm.weight'])
-        layer.feed_forward.w1.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.gate_proj.weight'])
-        layer.feed_forward.w2.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.down_proj.weight'])
-        layer.feed_forward.w3.weight = nn.Parameter(hf_dict[f'model.layers.{i}.mlp.up_proj.weight'])
-
-    # final classifier
-    model.output.weight = nn.Parameter(hf_dict['lm_head.weight'])
-    model.eval()
-    return model
-
-
-# -----------------------------------------------------------------------------
-# API entrypoint
-
-def model_export(model, filepath, version):
-    if version == 0:
-        legacy_export(model, filepath)
-    elif version == 1:
-        version1_export(model, filepath)
-    elif version == 2:
-        version2_export(model, filepath)
-    else:
-        raise ValueError(f"unknown version {version}")
-
-def torchscript_export(model, filepath, zero_params=False, gzip_output=False):
-    """
-    (This was submitted via a PR earlier. Leaving it here, but "orphaned" for now)
-    Saves the model as a TorchScript.
-    The resulting file can be loaded in C++ code and then used for training or
-    inference with:
-        #include <torch/script.h>
-        torch::jit::Module module = torch::jit::load("model.pt")
-    Note that the serialized model includes the initial parameters and with the default
-    ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
-    the model parameters separately you can zero out the parameters before saving it and
-    it will gzip down to 780K.
-    """
-
-    # If requested zero params before saving the model. This is useful in
-    # conjunction with gzip_output.
-    if zero_params:
-        for p in model.parameters():
-            p.detach().zero_()
-
-    torch.jit.save(torch.jit.script(model), filepath)
-
-    if gzip_output:
-        with open(filepath, "rb") as f_in:
-            with gzip.open(f"{filepath}.gz", "wb") as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        os.unlink(filepath)
-
-# -----------------------------------------------------------------------------
-# CLI entrypoint
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("filepath", type=str, help="the output filepath")
-    parser.add_argument("--version", default=0, type=int, help="the version to export with")
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument("--checkpoint", type=str, help="model checkpoint, .pt file")
-    group.add_argument("--meta-llama", type=str, help="meta llama model path")
-    group.add_argument("--hf", type=str, help="huggingface model path")
-    args = parser.parse_args()
-
-    if args.checkpoint:
-        model = load_checkpoint(args.checkpoint)
-    elif args.meta_llama:
-        model = load_meta_model(args.meta_llama)
-    elif args.hf:
-        model = load_hf_model(args.hf)
-
-    if model is None:
-        parser.error("Can't load input model!")
-
-    # export
-    model_export(model, args.filepath, args.version)
@@ -0,0 +1,112 @@
+"""
+This script exports the Llama 2 weights in llama2c.bin format.
+"""
+import os
+import sys
+import struct
+from pathlib import Path
+import json
+
+import torch
+
+from model import precompute_freqs_cis
+
+
+def export(p, state_dict, filepath='model.bin'):
+    """
+    export the model weights in fp32 into .bin file to be read from C
+
+    p:          dict of model params; must provide 'dim', 'n_layers' and 'n_heads',
+                may provide 'n_kv_heads'. 'vocab_size' and 'max_seq_len' are
+                overwritten in place.
+    state_dict: dict mapping weight names to tensors. It is consumed: every entry
+                is deleted right after it has been written, and temporary
+                'freqs_cos' / 'freqs_sin' entries are added and removed.
+    filepath:   output path of the .bin file
+    """
+    # first work out the header
+    hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
+    # read the vocab size off the embedding table instead of assuming 32000
+    p['vocab_size'] = state_dict['tok_embeddings.weight'].shape[0]
+    p['max_seq_len'] = 2048
+
+    n_kv_heads = p.get('n_kv_heads') or p['n_heads']
+    header = struct.pack(
+        'iiiiiii',
+        p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
+        n_kv_heads, -p['vocab_size'], p['max_seq_len']
+    )
+    # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
+    # in the checkpoint and should be loaded.
+
+    # the with-block closes the file even if a weight is missing or a write fails
+    with open(filepath, 'wb') as f:
+
+        def serialize(key):
+            print(f"writing {key}...")
+            t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
+            f.write(memoryview(t))
+            # drop the tensor as soon as it is on disk
+            del state_dict[key]
+
+        f.write(header)
+
+        # next write out the embedding weights
+        print("writing tok_embeddings...")
+        serialize('tok_embeddings.weight')
+
+        # now all the layers
+        # attention weights
+        for i in range(p['n_layers']): serialize(f'layers.{i}.attention_norm.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wq.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wk.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wv.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wo.weight')
+        # ffn weights
+        for i in range(p['n_layers']): serialize(f'layers.{i}.ffn_norm.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w1.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w2.weight')
+        for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w3.weight')
+
+        # final rmsnorm
+        serialize('norm.weight')
+        # freqs_cos, freqs_sin
+        freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
+        state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
+        state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
+        serialize('freqs_cos')
+        serialize('freqs_sin')
+
+        # finally write the output weights
+        serialize('output.weight')
+
+    print(f"wrote {filepath}")
+
+
+def concat_weights(models):
+    state_dict = {}
+    for name in list(models[0]):
+        tensors = [model[name] for model in models]
+        if len(tensors) == 1 or len(tensors[0].shape) == 1:
+            state_dict[name] = tensors[0]
+            continue
+        is_axis_1 = (
+            name.startswith('tok_embeddings.')
+            or name.endswith('.attention.wo.weight')
+            or name.endswith('.feed_forward.w2.weight')
+        )
+        axis = 1 if is_axis_1 else 0
+        state_dict[name] = torch.cat(tensors, dim=axis)
+        for model in models:
+            del model[name]
+    return state_dict
+
+
+def load_and_export(model_path, output_path):
+    params_path = os.path.join(model_path, 'params.json')
+    with open(params_path) as f:
+        params = json.load(f)
+        print(params)
+
+    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
+    models = [torch.load(p, map_location='cpu') for p in model_paths]
+    state_dict = concat_weights(models)
+    del models
+    export(params, state_dict, output_path)
+
+
+if __name__ == '__main__':
+    # both the model folder and the output path are required; with fewer
+    # arguments print the usage line and exit with a failure status instead
+    # of crashing with an IndexError on sys.argv[2]
+    if len(sys.argv) < 3:
+        print('[Llama model folder path] [output path]')
+        sys.exit(1)
+
+    model_path = sys.argv[1]
+    output_path = sys.argv[2]
+    load_and_export(model_path, output_path)
@@ -11,14 +11,12 @@ from torch import nn

@dataclass
 class ModelArgs:
-    # default hyperparameters for the Llama 7B model
    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    n_kv_heads: Optional[int] = None
-    vocab_size: int = 32000
-    hidden_dim: Optional[int] = None
-    multiple_of: int = 256  # MLP hidden layer size will be multiple of
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    norm_eps: float = 1e-5
    max_seq_len: int = 2048
    dropout: float = 0.0
@@ -95,7 +93,6 @@ class Attention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
-        assert args.n_heads % self.n_kv_heads == 0
        model_parallel_size = 1
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
@@ -167,10 +164,8 @@ class Attention(nn.Module):
 class FeedForward(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
-        if hidden_dim is None:
-            hidden_dim = 4 * dim
-            hidden_dim = int(2 * hidden_dim / 3)
-            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
@@ -189,7 +184,7 @@ class TransformerBlock(nn.Module):
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
-            hidden_dim=args.hidden_dim,
+            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
@@ -341,3 +336,55 @@ class Transformer(nn.Module):
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
+
+    def export(self, filepath='model.bin'):
+        """
+        export the model weights in fp32 into .bin file to be read from C
+
+        Layout: a header of 7 ints (dim, hidden_dim, n_layers, n_heads, n_kv_heads,
+        vocab_size, max_seq_len) followed by fp32 tensors in this order: token
+        embeddings, per-layer attention norm / wq / wk / wv / wo, per-layer
+        ffn norm / w1 / w2 / w3, final norm, freqs_cos, freqs_sin.
+        """
+        # first work out the header
+        hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
+        p = self.params
+        n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
+        header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
+                                       n_kv_heads, p.vocab_size, p.max_seq_len)
+
+        # the with-block closes the file even if serialization fails part-way
+        with open(filepath, 'wb') as f:
+
+            def serialize(t):
+                # flatten to a 1-D float32 array and write it as raw floats
+                d = t.detach().cpu().view(-1).numpy().astype(np.float32)
+                b = struct.pack(f'{len(d)}f', *d)
+                f.write(b)
+
+            f.write(header)
+
+            # next write out the embedding weights
+            serialize(self.tok_embeddings.weight)
+
+            # now all the layers
+            # attention weights
+            for layer in self.layers:
+                serialize(layer.attention_norm.weight)
+            for layer in self.layers:
+                serialize(layer.attention.wq.weight)
+            for layer in self.layers:
+                serialize(layer.attention.wk.weight)
+            for layer in self.layers:
+                serialize(layer.attention.wv.weight)
+            for layer in self.layers:
+                serialize(layer.attention.wo.weight)
+            # ffn weights
+            for layer in self.layers:
+                serialize(layer.ffn_norm.weight)
+            for layer in self.layers:
+                serialize(layer.feed_forward.w1.weight)
+            for layer in self.layers:
+                serialize(layer.feed_forward.w2.weight)
+            for layer in self.layers:
+                serialize(layer.feed_forward.w3.weight)
+            # final rmsnorm
+            serialize(self.norm.weight)
+            # note: no need to write final classifier weights due to weight sharing
+            # freqs_cis
+            serialize(self.freqs_cos[:p.max_seq_len])
+            serialize(self.freqs_sin[:p.max_seq_len])
+
+        print(f"wrote {filepath}")
@@ -2,6 +2,7 @@ numpy==1.23.5
 pytest==7.4.0
 Requests==2.31.0
 sentencepiece==0.1.99
+tiktoken==0.3.3
 torch==2.0.1
 tqdm==4.64.1
 wandb==0.15.5
@@ -89,27 +89,6 @@
        "cmd = f'./run {model_file} -t {temperature} -p {top_p} -n {max_token} -i \"{prompt}\"'\n",
        "!{cmd}"
      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "#@title Run Meta's Llama 2 models\n",
-        "\n",
-        "#@markdown input your huggingface [access token](https://huggingface.co/settings/tokens) to download Meta's Llama 2 models.\n",
-        "\n",
-        "from huggingface_hub import snapshot_download\n",
-        "\n",
-        "token = \"replace your huggingface access token\" #@param {type:\"string\"}\n",
-        "path = snapshot_download(repo_id=\"meta-llama/Llama-2-7b\",cache_dir=\"Llama-2-7b\", use_auth_token=token)\n",
-        "\n",
-        "!python export_meta_llama_bin.py $path llama2_7b.bin\n",
-        "\n",
-        "print(\"./run llama2_7b.bin\\n\")\n",
-        "!./run llama2_7b.bin"
-      ]
    }
  ],
  "metadata": {
@@ -5,19 +5,17 @@ import os
 import pickle
 from contextlib import nullcontext
 import torch
+import tiktoken
 from model import ModelArgs, Transformer
 from tokenizer import Tokenizer

-from tinystories import get_tokenizer_model_path
-
 # -----------------------------------------------------------------------------
-checkpoint = 'out/ckpt.pt'
+out_dir = 'out' # ignored if init_from is not 'resume'
 start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
 num_samples = 1 # number of samples to draw
 max_new_tokens = 100 # number of tokens generated in each sample
 temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
 top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability
-tokenizer = "" # override the tokenizer model path
 seed = 1337
 device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
 #dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
@@ -35,10 +33,11 @@ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torc
 ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

 # init from a model saved in a specific directory
-checkpoint_dict = torch.load(checkpoint, map_location=device)
-gptconf = ModelArgs(**checkpoint_dict['model_args'])
+ckpt_path = os.path.join(out_dir, 'ckpt.pt')
+checkpoint = torch.load(ckpt_path, map_location=device)
+gptconf = ModelArgs(**checkpoint['model_args'])
 model = Transformer(gptconf)
-state_dict = checkpoint_dict['model']
+state_dict = checkpoint['model']
 unwanted_prefix = '_orig_mod.'
 for k,v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
@@ -52,16 +51,7 @@ if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

 # load the tokenizer
-vocab_source = checkpoint_dict["config"].get("vocab_source", "llama2")
-vocab_size = gptconf.vocab_size
-if tokenizer:
-    # a specific tokenizer is provided, use it
-    tokenizer_model = tokenizer
-else:
-    # let's try to find the tokenizer model automatically. bit gross here...
-    query_vocab_size = 0 if vocab_source == "llama2" else vocab_size
-    tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size)
-enc = Tokenizer(tokenizer_model=tokenizer_model)
+enc = Tokenizer()

 # encode the beginning of the prompt
 if start.startswith('FILE:'):
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""Saves the model as a TorchScript.
+
+Usage examples:
+    ./save_torchscript.py
+    ./save_torchscript.py --dim=300
+    ./save_torchscript.py --gzip_output=True --zero_params=True
+
+The resulting file can be loaded in C++ code and then used for training or
+inference with:
+    #include <torch/script.h>
+    torch::jit::Module module = torch::jit::load("model.pt");
+
+Note that the serialized model includes the initial parameters and with the default
+ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
+the model parameters separately you can zero out the parameters before saving it and
+it will gzip down to 780K.
+"""
+import gzip
+import os
+import shutil
+from inspect import signature
+
+import torch
+
+from model import ModelArgs, Transformer
+
+# Model args config (main() looks each ModelArgs parameter up among these globals by name)
+dim = 288
+n_layers = 6
+n_heads = 6
+n_kv_heads = n_heads  # bound here, before overrides run; NOTE(review): overriding n_heads alone presumably leaves this at 6 - confirm
+multiple_of = 32
+max_seq_len = 256
+dropout = 0.0
+vocab_size = 32000
+norm_eps = 1e-5
+# Save config
+model_path = "model.pt"  # file that torch.jit.save writes to
+zero_params = False  # True: zero every model parameter before saving
+gzip_output = False  # True: also write f"{model_path}.gz" and delete the uncompressed file
+# Allow config overrides (configurator.py is executed in this module's global namespace)
+exec(open("configurator.py").read())
+
+
+def main() -> None:
+    """Build a Transformer from the module-level config globals and save it as TorchScript."""
+    model_args = {name: globals()[name] for name in signature(ModelArgs).parameters}
+    model = Transformer(ModelArgs(**model_args))
+
+    if zero_params:
+        # zeroed parameters are useful in conjunction with gzip_output
+        for param in model.parameters():
+            param.detach().zero_()
+
+    torch.jit.save(torch.jit.script(model), model_path)
+
+    if gzip_output:
+        # write <model_path>.gz, then remove the uncompressed file
+        with open(model_path, "rb") as f_in, gzip.open(f"{model_path}.gz", "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+        os.unlink(model_path)
+
+
+if __name__ == "__main__":
+    main()
@@ -1,84 +0,0 @@
-#define TESTING
-#include "run.c"
-
-void assert_eq(int a, int b) {
-    if (a != b) {
-        printf("Assertion failed: %d != %d\n", a, b);
-        exit(EXIT_FAILURE);
-    }
-}
-
-void test_prompt_encoding(Tokenizer* tokenizer, char* prompt, int* expected_tokens, int num_expected_tokens) {
-    // encode
-    int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int));
-    int num_prompt_tokens = 0; // the total number of prompt tokens
-    encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);
-
-    #if VERBOSITY == 1
-    // print maybe
-    printf("expected tokens:\n");
-    for (int i = 0; i < num_expected_tokens; i++) printf("%d ", expected_tokens[i]);
-    printf("\n");
-    printf("actual tokens:\n");
-    for (int i = 0; i < num_prompt_tokens; i++) printf("%d ", prompt_tokens[i]);
-    printf("\n");
-    #endif
-
-    // verify
-    assert_eq(num_prompt_tokens, num_expected_tokens);
-    for (int i = 0; i < num_prompt_tokens; i++) {
-        assert_eq(prompt_tokens[i], expected_tokens[i]);
-    }
-
-    #if VERBOSITY == 1
-    printf("OK\n");
-    printf("---\n");
-    #endif
-    free(prompt_tokens);
-}
-
-void test_prompt_encodings() {
-    // let's verify that the Tokenizer works as expected
-
-    char *tokenizer_path = "tokenizer.bin";
-    int vocab_size = 32000;
-    Tokenizer tokenizer;
-    build_tokenizer(&tokenizer, tokenizer_path, vocab_size);
-
-    // test 0 (test the empty string) (I added this as a simple case)
-    char *prompt0 = "";
-    int expected_tokens0[] = {1};
-    test_prompt_encoding(&tokenizer, prompt0, expected_tokens0, sizeof(expected_tokens0) / sizeof(int));
-
-    // the tests below are taken from the Meta Llama 2 repo example code
-    // https://github.com/facebookresearch/llama/blob/main/example_text_completion.py
-    // and the expected tokens come from me breaking in the debugger in Python
-
-    // test 1
-    char *prompt = "I believe the meaning of life is";
-    int expected_tokens[] = {1, 306, 4658, 278, 6593, 310, 2834, 338};
-    test_prompt_encoding(&tokenizer, prompt, expected_tokens, sizeof(expected_tokens) / sizeof(int));
-
-    // test 2
-    char* prompt2 = "Simply put, the theory of relativity states that ";
-    int expected_tokens2[] = {1, 3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871};
-    test_prompt_encoding(&tokenizer, prompt2, expected_tokens2, sizeof(expected_tokens2) / sizeof(int));
-
-    // test 3
-    char* prompt3 = "A brief message congratulating the team on the launch:\n\n        Hi everyone,\n\n        I just ";
-    int expected_tokens3[] = {1, 319, 11473, 2643, 378, 629, 271, 18099, 278, 3815, 373, 278, 6826, 29901, 13, 13, 4706, 6324, 14332, 29892, 13, 13, 4706, 306, 925, 29871};
-    test_prompt_encoding(&tokenizer, prompt3, expected_tokens3, sizeof(expected_tokens3) / sizeof(int));
-
-    // test 4
-    char* prompt4 = "Translate English to French:\n\n        sea otter => loutre de mer\n        peppermint => menthe poivrée\n        plush girafe => girafe peluche\n        cheese =>";
-    int expected_tokens4[] = {1, 4103, 9632, 4223, 304, 5176, 29901, 13, 13, 4706, 7205, 4932, 357, 1149, 301, 449, 276, 316, 2778, 13, 4706, 1236, 407, 837, 524, 1149, 6042, 354, 772, 440, 29878, 1318, 13, 4706, 715, 1878, 330, 3055, 1725, 1149, 330, 3055, 1725, 4639, 28754, 13, 4706, 923, 968, 1149};
-    test_prompt_encoding(&tokenizer, prompt4, expected_tokens4, sizeof(expected_tokens4) / sizeof(int));
-
-    // memory and file handles cleanup
-    free_tokenizer(&tokenizer);
-}
-
-int main(int argc, char *argv[]) {
-    test_prompt_encodings();
-    printf("ALL OK\n");
-}
@@ -4,71 +4,37 @@ $ pytest
 """
 import os
 import pytest # pip install pytest
-import requests
 import subprocess

-
 import torch
 from model import ModelArgs, Transformer
-from tokenizer import Tokenizer

-# -----------------------------------------------------------------------------
-# test utilities
+def test_argmax_inference():
+    """
+    Only the simplest test for now: run inference with temperature 0 
+    (for determinism) in both C and PyTorch, and see that the sampled tokens 
+    are the same.
+    """
+    test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?

-test_ckpt_dir = "test"
+    # run C version
+    model_path = os.path.join(test_ckpt_dir, "model.bin")
+    command = ["./run", model_path, "0.0"]
+    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
+    c_tokens = []
+    for line in proc.stdout:
+        token = int(line.decode('utf-8').strip())
+        c_tokens.append(token)
+    proc.wait()
+    #print(c_tokens)

-def download_file(url, filename):
-    print(f"Downloading {url} to {filename}")
-    response = requests.get(url, stream=True)
-    response.raise_for_status() # Raise an HTTPError on bad status code
-    with open(filename, 'wb') as file:
-        for chunk in response.iter_content(chunk_size=8192):
-            file.write(chunk)
-
-def attempt_download_files():
-    os.makedirs(test_ckpt_dir, exist_ok=True)
-    root_url = "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K"
-    need = ["stories260K.bin", "stories260K.pt", "tok512.bin", "tok512.model"]
-    for file in need:
-        url = root_url + '/' + file   #os.path.join inserts \\ on windows
-        filename = os.path.join(test_ckpt_dir, file)
-        if not os.path.exists(filename):
-            download_file(url, filename)
-
-expected_stdout = b'Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.\nLily\'s mom said, "Lily, let\'s go to the park." Lily was sad and didn\'t know what to do. She said, "I want to play with your ball, but I can\'t find it."\nLily was sad and didn\'t know what to do. She said, "I\'m sorry, Lily. I didn\'t know what to do."\nLily didn\'t want to help her mom, so she'
-
-# -----------------------------------------------------------------------------
-# actual tests
-
-def test_runc():
-    """ Forwards a model against a known-good desired outcome in run.c for 200 steps"""
-    attempt_download_files()
-
-    model_path = os.path.join(test_ckpt_dir, "stories260K.bin")
-    tokenizer_path = os.path.join(test_ckpt_dir, "tok512.bin")
-    command = ["./run", model_path, "-z", tokenizer_path, "-t", "0.0", "-n", "200"]
-    with open('err.txt', mode='wb') as fe:
-        with open('stdout.txt', mode='wb') as fo:
-            proc = subprocess.Popen(command, stdout=fo, stderr=fe)  #pipe in windows terminal does funny things like replacing \n with \r\n
-            proc.wait()
-
-    with open('stdout.txt', mode='r') as f:
-        stdout = f.read()
-    # strip the very last \n that is added by run.c for aesthetic reasons
-    stdout = stdout[:-1].encode('ascii')
-
-    assert stdout == expected_stdout
-
-def test_python():
-    """ Forwards a model against a known-good desired outcome in sample.py for 200 steps"""
-    attempt_download_files()
-
-    device = "cpu" # stories260K is small enough to just breeze through it on CPU
-    checkpoint = os.path.join(test_ckpt_dir, "stories260K.pt")
-    checkpoint_dict = torch.load(checkpoint, map_location=device)
-    gptconf = ModelArgs(**checkpoint_dict['model_args'])
+    # run PyTorch version
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    gptconf = ModelArgs(**checkpoint['model_args'])
    model = Transformer(gptconf)
-    state_dict = checkpoint_dict['model']
+    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
@@ -78,12 +44,10 @@ def test_python():
    model.to(device)
    x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
    with torch.inference_mode():
-        y = model.generate(x, max_new_tokens=200, temperature=0.0)
+        y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
    pt_tokens = y[0].tolist()
+    pt_tokens = pt_tokens[1:] # remove BOS
+    #print(pt_tokens)

-    tokenizer_model = os.path.join(test_ckpt_dir, "tok512.model")
-    enc = Tokenizer(tokenizer_model=tokenizer_model)
-    text = enc.decode(pt_tokens)
-    text = text.encode('ascii') # turn into bytes
-
-    assert text == expected_stdout
+    # compare
+    assert c_tokens == pt_tokens
@@ -0,0 +1,140 @@
+"""
+Download, preprocess and serve the TinyShakespeare dataset as a DataLoader.
+
+Follows the same interface as the TinyStories dataset.
+"""
+
+import argparse
+import os
+import random
+
+import numpy as np
+import requests
+import torch
+import torch.distributed as dist
+from tqdm import tqdm
+
+from tokenizer import Tokenizer
+
+DATA_CACHE_DIR = "data"
+
+def download_file(url: str, fname: str, chunk_size=1024):
+    """Stream `url` to the local file `fname`, showing a tqdm progress bar.
+
+    Raises requests.HTTPError on a 4xx/5xx reply, so an error page is never saved as data.
+    """
+    resp = requests.get(url, stream=True)
+    resp.raise_for_status()  # fail before fname is opened (and truncated)
+    total = int(resp.headers.get("content-length", 0))  # 0 when the header is absent
+    with open(fname, "wb") as file, tqdm(
+        desc=fname, total=total, unit="iB", unit_scale=True, unit_divisor=1024
+    ) as bar:
+        for data in resp.iter_content(chunk_size=chunk_size):
+            size = file.write(data)
+            bar.update(size)
+
+
+def download():
+    """Fetch the TinyShakespeare text into DATA_CACHE_DIR, unless it is already there."""
+    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
+
+    # source URL and the local path the raw text is stored under
+    data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
+    data_filename = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
+    if os.path.exists(data_filename):
+        print(f"{data_filename} already exists, skipping download...")
+    else:
+        print(f"Downloading {data_url} to {data_filename}...")
+        download_file(data_url, data_filename)
+
+    print("Download done.")
+
+def pretokenize():
+    """Tokenize DATA_CACHE_DIR/tinyshakespeare.txt line by line and save the ids as a uint16 .bin file."""
+    enc = Tokenizer()
+    data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
+    bin_file = data_file.replace(".txt", ".bin")
+
+    all_tokens = []
+    with open(data_file, "r") as f:
+        for line in f:
+            # each stripped line is encoded separately (bos=True, eos=False)
+            all_tokens.extend(enc.encode(line.strip(), bos=True, eos=False))
+    all_tokens = np.array(all_tokens, dtype=np.uint16)
+    print(f"Total tokens: {len(all_tokens)}")
+    with open(bin_file, "wb") as f:
+        f.write(all_tokens.tobytes())
+    print(f"Saved {bin_file}")
+    print("Done.")
+
+
+class PretokDataset(torch.utils.data.IterableDataset):
+    """Endless stream of (x, y) next-token pairs cut from the pretokenized .bin file.
+
+    x and y are int64 tensors of max_seq_len tokens each; y is x shifted by one token.
+    """
+
+    def __init__(self, split, max_seq_len):
+        """Store the split name ("train"; any other value selects validation) and sequence length."""
+        super().__init__()
+        self.max_seq_len = max_seq_len
+        self.split = split
+
+    def __iter__(self):
+        # the rng seed is derived from the DataLoader worker id and the DDP rank
+        # (both fall back to 0 outside of a worker / an initialized process group)
+        info = torch.utils.data.get_worker_info()
+        worker_id = info.id if info else 0
+        rank = dist.get_rank() if dist.is_initialized() else 0
+        seed = 42 + worker_id + 1337 * rank
+        rng = random.Random(seed)
+        print(f"Created a PretokDataset with rng seed {seed}")
+
+        # memory-map the tokens: the first 90% are train, the last 10% validation
+        data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.bin")
+        tokens = np.memmap(data_file, dtype=np.uint16, mode="r")
+        cut = int(len(tokens) * 0.9)
+        m = tokens[:cut] if self.split == "train" else tokens[cut:]
+
+        seq_len = self.max_seq_len
+        # one less than the number of whole windows: each sample reads seq_len + 1 tokens
+        num_batches = len(m) // seq_len - 1
+        assert num_batches > 0, "this split is way too small? investigate."
+
+        while True:
+            order = list(range(num_batches))
+            rng.shuffle(order)
+            for ix in order:
+                start = ix * seq_len
+                end = start + seq_len + 1
+                # .astype copies the mapped slice into a new int64 array in RAM
+                chunk = torch.from_numpy(m[start:end].astype(np.int64))
+                # inputs, and the same tokens moved one step ahead as targets
+                yield chunk[:-1], chunk[1:]
+
+
+class ShakespeareTask:
+    """TinyShakespeare batch source."""
+
+    @staticmethod
+    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
+        """Yield (x, y) batches of `split` from a PretokDataset, moved onto `device`."""
+        dataset = PretokDataset(split, max_seq_len)
+        loader_kwargs = dict(batch_size=batch_size, pin_memory=True, num_workers=num_workers)
+        loader = torch.utils.data.DataLoader(dataset, **loader_kwargs)
+        for x, y in loader:
+            # transfer both tensors to the target device before handing them out
+            yield x.to(device, non_blocking=True), y.to(device, non_blocking=True)
+
+
+if __name__ == "__main__":
+    # stage name -> handler; the CLI choices are derived from this table so that
+    # argparse can never accept a stage that has no handler
+    fun = {
+        "download": download,
+        "pretokenize": pretokenize,
+    }
+    parser = argparse.ArgumentParser()
+    parser.add_argument("stage", type=str, choices=list(fun))
+    args = parser.parse_args()
+    fun[args.stage]()
@@ -9,11 +9,9 @@ import os
 import random
 from typing import List
 from concurrent.futures import ProcessPoolExecutor
-from functools import partial

 import numpy as np
 import requests
-import sentencepiece as spm
 import torch
 import torch.distributed as dist
 from tqdm import tqdm
@@ -39,7 +37,7 @@ def download_file(url: str, fname: str, chunk_size=1024):


 def download():
-    """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
+    """Downloads the dataset to disk."""
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)

    # download the TinyStories dataset, unless it's already downloaded
@@ -68,66 +66,10 @@ def download():
    print(f"Number of shards: {len(shard_filenames)}")
    print(f"Example story:\n{data[0]}")

-def train_vocab(vocab_size):
-    """
-    Trains a custom sentencepiece tokenizer on the TinyStories dataset.
-    The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories,
-    where N is the vocab size. This is also where the pretok .bin files will go.
-    """
-    assert vocab_size > 0, "Vocab size must be positive"

-    # output file prefix path for sentencepiece
-    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
-
-    # how many shards we'll use for vocab training, kept low for efficiency
-    num_shards = 10
-
-    # 1) export a large chunk of text as a single text file tiny.txt
-    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
-    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
-
-    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
-    with open(tiny_file, "w") as of:
-        for shard in tqdm(shard_filenames[:num_shards]):
-            with open(shard, "r") as f:
-                data = json.load(f)
-            for example in data:
-                text = example["story"]
-                text = text.strip()
-                of.write(text + "\n")
-    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
-
-    # 2) train the sentencepiece model
-    print("Will now train the vocab...")
-    spm.SentencePieceTrainer.train(input=tiny_file,
-                                   model_prefix=prefix,
-                                   model_type="bpe",
-                                   vocab_size=vocab_size,
-                                   self_test_sample_size=0,
-                                   input_format="text",
-                                   character_coverage=1.0,
-                                   num_threads=os.cpu_count(),
-                                   split_digits=True,
-                                   allow_whitespace_only_pieces=True,
-                                   byte_fallback=True,
-                                   unk_surface=r" \342\201\207 ",
-                                   normalization_rule_name="identity")
-
-    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
-    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
-    if dec.lower() == "y":
-        os.remove(tiny_file)
-        print(f"Deleted {tiny_file}")
-
-    print(f"Trained tokenizer is in {prefix}.model")
-    print("Done.")
-
-
-def process_shard(args, vocab_size):
+def process_shard(args):
    shard_id, shard = args
-    tokenizer_model = get_tokenizer_model_path(vocab_size)
-    enc = Tokenizer(tokenizer_model)
+    enc = Tokenizer()
    with open(shard, "r") as f:
        data = json.load(f)
    all_tokens = []
@@ -138,49 +80,31 @@ def process_shard(args, vocab_size):
        all_tokens.extend(tokens)
    # convert to uint16 nparray
    all_tokens = np.array(all_tokens, dtype=np.uint16)
-    # calculate the output filename
-    if vocab_size == 0:
-        # if we're using Llama 2, just save the tokenized file in the same dir
-        tokenized_filename = shard.replace(".json", ".bin")
-    else:
-        # save .bin files into a new tok{N} directory
-        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
-        shard_basename = os.path.basename(shard)
-        bin_basename = shard_basename.replace(".json", ".bin")
-        tokenized_filename = os.path.join(bin_dir, bin_basename)
-    # write the bytes
+    # write to disk
+    tokenized_filename = shard.replace(".json", ".bin")
    with open(tokenized_filename, "wb") as f:
        f.write(all_tokens.tobytes())
-    # calculate the average sequence length (they are separated by BOS=1)
-    avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
-    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
+    print(f"Saved {tokenized_filename}")


-def pretokenize(vocab_size):
+def pretokenize():
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
-    if vocab_size > 0:
-        # .bin files will be saved into tok{N} directory, create it once here
-        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
-        os.makedirs(bin_dir, exist_ok=True)

    # process all the shards in a process pool
-    fun = partial(process_shard, vocab_size=vocab_size)
    with ProcessPoolExecutor() as executor:
-        executor.map(fun, enumerate(shard_filenames))
+        executor.map(process_shard, enumerate(shard_filenames))
    print("Done.")


 class PretokDataset(torch.utils.data.IterableDataset):
    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""

-    def __init__(self, split, max_seq_len, vocab_size, vocab_source):
+    def __init__(self, split, max_seq_len):
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.vocab_source = vocab_source

    def __iter__(self):
        # get worker info within a DataLoader
@@ -192,17 +116,10 @@ class PretokDataset(torch.utils.data.IterableDataset):
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
-        if self.vocab_source == "llama2":
-            # the .bin files are right along the .json files
-            bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
-        elif self.vocab_source == "custom":
-            # the .bin files are in tok{N} directory
-            bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}")
-            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
+        data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
+        shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.bin")))
        # train/test split. let's use only shard 0 for test split, rest train
        shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
-        assert len(shard_filenames)>0, f"No bin files found in {bin_dir}"
        while True:
            rng.shuffle(shard_filenames)
            for shard in shard_filenames:
@@ -222,25 +139,12 @@ class PretokDataset(torch.utils.data.IterableDataset):
                    y = chunk[1:]
                    yield x, y

-# -----------------------------------------------------------------------------
-# public interface functions
-
-def get_tokenizer_model_path(vocab_size):
-    """
-    Returns path to the sentencepiece tokenizer model for a given vocab size
-    vocab_size = 0 designates the default Llama 2 tokenizer, in that case
-    None is returned.
-    """
-    if vocab_size == 0:
-        return None
-    else:
-        return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")

 class Task:

    @staticmethod
-    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
-        ds = PretokDataset(**dataset_kwargs)
+    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
+        ds = PretokDataset(split, max_seq_len)
        dl = torch.utils.data.DataLoader(
            ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
        )
@@ -249,33 +153,16 @@ class Task:
            y = y.to(device, non_blocking=True)
            yield x, y

-# -----------------------------------------------------------------------------
-# CLI for constructing the dataset

 if __name__ == "__main__":
-    """
-    These stages are designed to be run in order.
-
-    To tokenize data with the Llama 2 tokenizer:
-    python tinystories.py download
-    python tinystories.py pretokenize
-
-    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
-    python tinystories.py download
-    python tinystories.py train_vocab --vocab_size=2048
-    python tinystories.py pretokenize --vocab_size=2048
-    """
     parser = argparse.ArgumentParser()
-    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
-    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
+    parser.add_argument("stage", type=str, choices=["download", "pretokenize"])
     args = parser.parse_args()
 
     # depending on the stage call the appropriate function
-    if args.stage == "download":
-        download()
-    elif args.stage == "train_vocab":
-        train_vocab(vocab_size=args.vocab_size)
-    elif args.stage == "pretokenize":
-        pretokenize(vocab_size=args.vocab_size)
-    else:
-        raise ValueError(f"Unknown stage {args.stage}")
+    fun = {
+        "download": download,
+        "pretokenize": pretokenize,
+    }
+    fun[args.stage]()
+
@@ -4,19 +4,20 @@

 import os
 import struct
-import argparse
+from logging import getLogger
 from typing import List

 from sentencepiece import SentencePieceProcessor

 TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
+TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C

 class Tokenizer:
-    def __init__(self, tokenizer_model=None):
-        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
+    def __init__(self):
+        model_path = TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        self.model_path = model_path
+        #print(f"Loaded SentencePiece model from {model_path}")

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
@@ -51,28 +52,24 @@ class Tokenizer:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
+            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
+                t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
            b = t.encode('utf-8') # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)
-
+        
        # record the max token length
        max_token_length = max(len(t) for t in tokens)

        # write to a binary file
-        # the tokenizer.bin file is the same as .model file, but .bin
-        tokenizer_bin = self.model_path.replace('.model', '.bin')
-        with open(tokenizer_bin, 'wb') as f:
+        with open(TOKENIZER_BIN, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            for bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(bytes)))
                f.write(bytes)

 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ")
-    args = parser.parse_args()
-
-    t = Tokenizer(args.tokenizer_model)
+    t = Tokenizer()
    t.export()
@@ -29,7 +29,7 @@ from torch.distributed import destroy_process_group, init_process_group
 from torch.nn.parallel import DistributedDataParallel as DDP

 from tinystories import Task
-from export import model_export
+from tinyshakespeare import ShakespeareTask

 # -----------------------------------------------------------------------------
 # I/O
@@ -47,13 +47,11 @@ wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
 # data
 batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
-vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
+dataset = "tinystories"  # tinystories|tinyshakespeare
 # model
 dim = 288
 n_layers = 6
 n_heads = 6
-n_kv_heads = 6
 multiple_of = 32
 dropout = 0.0
 # adamw optimizer
@@ -85,10 +83,6 @@ config = {k: globals()[k] for k in config_keys}  # will be useful for logging
 lr_decay_iters = max_iters  # should be ~= max_iters per Chinchilla
 min_lr = 0.0  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

-# validating checks
-assert vocab_source in ["llama2", "custom"]
-assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
-
 # various inits, derived attributes, I/O setup
 ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
 if ddp:
@@ -129,12 +123,11 @@ ctx = (
 )

 # task-specific setup
+task = {'tinystories': Task, 'tinyshakespeare': ShakespeareTask}[dataset]
 iter_batches = partial(
-    Task.iter_batches,
+    task.iter_batches,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
-    vocab_size=vocab_size,
-    vocab_source=vocab_source,
    device=device,
    num_workers=0,
 )
@@ -148,8 +141,8 @@ model_args = dict(
    dim=dim,
    n_layers=n_layers,
    n_heads=n_heads,
-    n_kv_heads=n_kv_heads,
-    vocab_size=vocab_size,
+    n_kv_heads=n_heads,
+    vocab_size=32000,
    multiple_of=multiple_of,
    max_seq_len=max_seq_len,
    dropout=dropout,
@@ -213,7 +206,7 @@ def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
-        batch_iter = iter_batches(split=split)
+        batch_iter = iter_batches(split)
        losses = torch.zeros(eval_iters)  # keep on CPU
        for k in range(eval_iters):
            X, Y = next(batch_iter)
@@ -245,7 +238,7 @@ if wandb_log and master_process:
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)

 # training loop
-train_batch_iter = iter_batches(split="train")
+train_batch_iter = iter_batches("train")
 X, Y = next(train_batch_iter)  # fetch the very first batch
 t0 = time.time()
 local_iter_num = 0  # number of iterations in the lifetime of this process
@@ -271,7 +264,7 @@ while True:
                        "loss/val": losses["val"],
                        "lr": lr,
                        "mfu": running_mfu * 100,  # convert to percentage
-                    }, step = iter_num
+                    }
                )
            except Exception as e:
                print(f"logging to wandb failed: {e}")
@@ -288,7 +281,7 @@ while True:
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
-                model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
+                raw_model.export(os.path.join(out_dir, "model.bin"))
    if iter_num == 0 and eval_only:
        break