1 Commits

Author SHA1 Message Date
Andrej Karpathy d0309ab2d4 add avx2 intrinsics maybe 2023-08-10 15:01:53 +00:00
20 changed files with 801 additions and 1788 deletions
+7 -76
View File
@@ -4,12 +4,10 @@ on:
push:
branches:
- master
paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h', '**/*.py']
paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/Makefile', '**/*.c', '**/*.h', '**/*.py']
# for manual triggering
workflow_dispatch:
paths: ['**/Makefile', '**/*.c', '**/*.h']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -17,7 +15,7 @@ env:
jobs:
# check basic builds to avoid breaking changes
ubuntu-focal-make:
runs-on: ubuntu-latest
runs-on: ubuntu-20.04
steps:
- name: Clone
@@ -30,16 +28,6 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential -y
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Pip setup
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Build
id: make_build
run: |
@@ -50,10 +38,6 @@ jobs:
run: |
make runfast
- name: Test with pytest
run: |
pytest
macOS-latest-make:
runs-on: macos-latest
@@ -68,21 +52,6 @@ jobs:
run: |
brew update
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Pip setup
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Build clang
id: make_build_clang
run: |
make run CC=clang
- name: Build
id: make_build
run: |
@@ -93,17 +62,15 @@ jobs:
run: |
make runfast
- name: Test with pytest
run: pytest
- name: Build clang
id: make_build_clang
run: |
make run CC=clang
windows-latest-make:
runs-on: windows-latest
strategy:
fail-fast: false #necessary, otherwise the matrix breaks
matrix:
arch:
- amd64
@@ -123,30 +90,11 @@ jobs:
with:
arch: ${{ matrix.arch }}
- name: Set up Python 3.10
if: matrix.arch != 'amd64_arm64'
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Pip setup
if: matrix.arch != 'amd64_arm64'
run: |
python -m pip install --upgrade pip
if (Test-Path requirements.txt) {
pip install -r requirements.txt
}
- name: Build ${{ matrix.arch }}
id: build_msvc
run: |
.\build_msvc.bat
#cross-compiled, cannot be run on host
- name: Test with pytest
if: matrix.arch != 'amd64_arm64'
run: pytest
windows-latest-mingw:
runs-on: windows-latest
@@ -174,20 +122,3 @@ jobs:
id: build_mingw
run: |
make win64
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Pip setup
shell: powershell
run: |
python -m pip install --upgrade pip
if (Test-Path requirements.txt) {
pip install -r requirements.txt
}
- name: Test with pytest
shell: powershell
run: pytest
+9 -18
View File
@@ -32,6 +32,15 @@ runfast: run.c
runomp: run.c
$(CC) -Ofast -fopenmp -march=native run.c -lm -o run
# compile with AVX2 intrinsics enabled
# -mavx2 allows the compiler to emit AVX2 instructions and -DLLAMAC_AVX2
# defines the LLAMAC_AVX2 preprocessor macro for run.c (presumably the switch
# that selects the intrinsics code path -- confirm in run.c).
# The output is written to `run`, the same binary name the other run* targets use.
.PHONY: runavx2
runavx2: run.c
$(CC) -Ofast -march=native -mavx2 -DLLAMAC_AVX2 -o run run.c -lm
# AVX2 build with OpenMP enabled as well: same flags as runavx2 plus -fopenmp.
# Also writes its output to `run`.
.PHONY: runompavx2
runompavx2: run.c
$(CC) -Ofast -fopenmp -march=native -mavx2 -DLLAMAC_AVX2 run.c -lm -o run
.PHONY: win64
win64:
x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -o run.exe -I. run.c win.c
@@ -45,24 +54,6 @@ rungnu:
runompgnu:
$(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run
# run all tests
.PHONY: test
test:
pytest
# run only tests for run.c C implementation (is a bit faster if only C code changed)
.PHONY: testc
testc:
pytest -k runc
# run the C tests, without touching pytest / python
# to increase verbosity level run e.g. as `make testcc VERBOSITY=1`
# VERBOSITY is passed to test.c as a preprocessor define; `?=` only supplies
# the default of 0 when the variable has not already been set by the caller.
# NOTE: the compiled binary is named `testc`, the same name as the `testc`
# target; that target is declared .PHONY, so the file on disk does not shadow it.
VERBOSITY ?= 0
.PHONY: testcc
testcc:
$(CC) -DVERBOSITY=$(VERBOSITY) -O3 -o testc test.c -lm
./testc
.PHONY: clean
clean:
rm -f run
+28 -130
View File
@@ -4,11 +4,9 @@
<img src="assets/llama_cute.jpg" width="300" height="300" alt="Cute Llama">
</p>
Train the Llama 2 LLM architecture in PyTorch then inference it with one simple 700-line C file ([run.c](run.c)). You might think that you need many billion parameter LLMs to do anything useful, but in fact very small LLMs can have surprisingly strong performance if you make the domain narrow enough (ref: [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) paper). This repo is a "fullstack" train + inference solution for Llama 2 LLM, with focus on minimalism and simplicity.
With the code in this repo you can train the Llama 2 LLM architecture from scratch in PyTorch, then export the weights to a binary file, and load that into one ~simple 500-line C file ([run.c](run.c)) that inferences the model. Alternatively, you can load, finetune, and inference Meta's Llama 2 (but this is still being actively fleshed out). Hence, this repo is a "fullstack" train + inference solution for Llama 2 LLM, with a focus on minimalism and simplicity. You might think that you need many billion parameter LLMs to do anything useful, but in fact very small LLMs can have surprisingly strong performance if you make the domain narrow enough. I recommend looking at the [TinyStories](https://huggingface.co/datasets/roneneldan/TinyStories) paper for inspiration.
As the architecture is identical, you can also load and inference Meta's Llama 2 models. However, the current code only inferences models in fp32, so you will most likely not be able to productively load models larger than 7B. Work on model quantization is currently ongoing.
Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. Compared to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies.
Please note that this started recently as just a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. I wanted something super minimal so I chose to hard-code the Llama 2 architecture, stick to fp32, and just roll one inference file of pure C with no dependencies.
## feel the magic
@@ -58,20 +56,18 @@ You can also prompt the model with a prefix or a number of additional command li
> One day, Lily met a Shoggoth. He was very shy, but was also very generous. Lily said “Hello Shoggy! Can I be your friend?” Shoggy was happy to have a friend and said “Yes, lets explore the universe together!” So they set off on a journey to explore the universe. As they travelled, Shoggy was happy to explain to Lily about all the wonderful things in the universe. At the end of the day, Lily and Shoggy had gathered lots of wonderful things from the universe, and they both felt very proud. They promised to explore the universe as one big pair and to never stop being generous to each other.
There is also an even better 110M param model available, see [models](#models).
Quick note on sampling, the recommendation for ~best results is to sample with `-t 1.0 -p 0.9`, i.e. temperature 1.0 (default) but also top-p sampling at 0.9 (default). Intuitively, top-p ensures that tokens with tiny probabilities do not get sampled, so we can't get "unlucky" during sampling, and we are less likely to go "off the rails" afterwards. More generally, to control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
There is also an even better 110M param model available, see [models](#models). Quick note on sampling, the recommendation for good results is to use `-t 1.0 -p 0.9`, i.e. top-p sampling at 0.9 with temperature 1.0 (this is the default). To control the diversity of samples use either the temperature (i.e. vary `-t` between 0 and 1 and keep top-p off with `-p 0`) or the top-p value (i.e. vary `-p` between 0 and 1 and keep `-t 1`), but not both. Nice explainers on LLM sampling strategies include [this](https://peterchng.com/blog/2023/05/02/token-selection-strategies-top-k-top-p-and-temperature/), [this](https://docs.cohere.com/docs/controlling-generation-with-top-k-top-p) or [this](https://huggingface.co/blog/how-to-generate).
## Meta's Llama 2 models
As the neural net architecture is identical, we can also inference the Llama 2 models released by Meta. Sadly there is a bit of friction here due to licensing (I can't directly upload the checkpoints, I think). So Step 1, get the Llama 2 checkpoints by following the [Meta instructions](https://github.com/facebookresearch/llama). Once we have those checkpoints, we have to convert them into the llama2.c format.
For this we need to install the python dependencies (`pip install -r requirements.txt`) and then use the `export.py` file, e.g. for 7B model:
For this we need to install the python dependencies (`pip install -r requirements.txt`) and then use the `export_meta_llama_bin.py` file, e.g. for 7B model:
```bash
python export.py llama2_7b.bin --meta-llama path/to/llama/model/7B
python export_meta_llama_bin.py path/to/llama/model/7B llama2_7b.bin
```
The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that despite efforts, the 13B export currently doesn't work. I would not attempt to run anything above 7B right now for two reasons: first, 13B+ currently doesn't work because of integer overflow in pointer arithmetic, which is yet to be fixed, and second, even if it were fixed, this repo is doing float32 inference right now, so it would be fairly unusably slow. Once the export is done, we can run it:
The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that despite efforts, the 13B export currently doesn't work for unknown reasons (accepting PRs for fix). We can run the model as normal:
```bash
./run llama2_7b.bin
@@ -83,48 +79,15 @@ This ran at about 4 tokens/s compiled with [OpenMP](#OpenMP) on 96 threads on my
base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should be possible to also inference the chat model quite easily, and have a conversation with it. And if we can find a way to run 7B more efficiently, we can start adding LoRA to our training script, and going wild with finetunes all within the repo!
You can also chat with the Llama Chat models. Export the chat model exactly as above:
```bash
python export.py llama2_7b_chat.bin --meta-llama /path/to/7B-chat
```
Then chat with it by specifying the chat mode using the `-m` flag, e.g.:
```bash
./run llama2_7b_chat.bin -m chat
```
You can also try Meta's Code Llama models even if support for them is incomplete. In particular, some hyperparameters changed (e.g. the constant in RoPE layer), so the inference is not exactly correct and a bit buggy right now. Looking into fixes. Make sure to build the tokenizer for the plain and instruct variants and pass it when doing inference.
```bash
python export.py codellama2_7b.bin --meta-llama /path/to/CodeLlama-7b
python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b/tokenizer.model
./run codellama2_7b.bin -z /path/to/CodeLlama-7b/tokenizer.bin
```
Chat with Code Llama Instruct:
```bash
python export.py codellama2_7b_instruct.bin --meta-llama /path/to/CodeLlama-7b-Instruct
python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b-Instruct/tokenizer.model
./run codellama2_7b_instruct.bin -m chat -z /path/to/CodeLlama-7b-Instruct/tokenizer.bin
```
## huggingface models
We can load any huggingface models that use the Llama 2 architecture. See the script [export.py](export.py) and the `--hf` flag to export the model .bin file.
## models
For the sake of examples of smaller, from-scratch models, I trained a small model series on TinyStories. All of these trained in a few hours on my training setup (4X A100 40GB GPUs). The 110M took around 24 hours. I am hosting them on huggingface hub [tinyllamas](https://huggingface.co/karpathy/tinyllamas), both in the original PyTorch .pt, and also in the llama2.c format .bin:
| model | dim | n_layers | n_heads | n_kv_heads | max context length | parameters | val loss | download
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 260K | 64 | 5 | 8 | 4 | 512 | 260K | 1.297 | [stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)
| OG | 288 | 6 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
| 42M| 512 | 8 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
| 110M| 768 | 12 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |
| model | dim | n_layers | n_heads | max context length | parameters | val loss | download
| --- | --- | --- | --- | --- | --- | --- | --- |
| OG | 288 | 6 | 6 | 256 | 15M | 1.072 | [stories15M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin) |
| 42M| 512 | 8 | 8 | 1024 | 42M | 0.847 | [stories42M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin) |
| 110M| 768 | 12 | 12 | 1024 | 110M | 0.760 | [stories110M.bin](https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin) |
You'll notice that the 110M model is equivalent to GPT-1 in size. Alternatively, this is also the smallest model in the GPT-2 series (`GPT-2 small`), except the max context length is only 1024 instead of 2048. The only notable changes from the GPT-1/2 architecture are that Llama uses RoPE relative positional embeddings instead of absolute/learned positional embeddings, a bit more fancy SwiGLU non-linearity in the MLP, RMSNorm instead of LayerNorm, bias=False on all Linear layers, and is optionally multiquery (but this is not yet supported in llama2.c).
@@ -167,53 +130,15 @@ Watch the tokens stream by, fun! We can also run the PyTorch inference script fo
```bash
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt -P out15M
python sample.py --checkpoint=out15M/stories15M.pt
mv out15M/stories15M.pt out15M/ckpt.pt # sorry the sample script currently assumes this directory structure / filename...
python sample.py --out_dir=out15M
```
Which gives the same results.
## custom tokenizers
In everything above, we've assumed the custom Llama 2 tokenizer with 32,000 tokens. However, in many boutique LLMs, using a vocabulary this big might be overkill. If you have a small application in mind, you might be much better off training your own tokenizers. This can make everything nicer - with smaller vocabs your model has fewer parameters (because the token embedding table is a lot smaller), the inference is faster (because there are fewer tokens to predict), and your average sequence length per example could also get smaller (because the compression is a lot more efficient on your data). So let's see how we train a custom tokenizer.
By default, to pretokenize the tinystories dataset we had to run, in order:
Which gives the same results. More detailed testing will be done in `test_all.py`. Currently you will need two files to test or sample: both the .bin file, and the .ckpt file inside a directory (see `test_all.py` for details). Sorry this is a bit janky right now, I have to think through running the tests without having to download 200MB of data. But run the tests with pytest:
```bash
$ pytest
```
python tinystories.py download
python tinystories.py pretokenize
```
The `pretokenize` stage here loads the Llama 2 tokenizer (vocab size 32,000) and uses it to convert the downloaded text into integers, and saves that to file. We now change this as follows, to train an example 4096-token tokenizer:
```
python tinystories.py download
python tinystories.py train_vocab --vocab_size=4096
python tinystories.py pretokenize --vocab_size=4096
```
The `train_vocab` stage will call the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size.
A quick note of interest is that vocab size of 4096 trained specifically on tinystories creates integer sequences with about the same sequence length per example as the default Llama 2 tokenizer of 32000 tokens! This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster.
Now that we have pretokenized the dataset with our custom tokenizer, we can train the model. The training script `train.py` doesn't care about the exact tokens, it only cares about the vocabulary size so it can correctly initialize the model. So when training your model, make sure to pass in
```
python train.py --vocab_source=custom --vocab_size=4096
```
(The defaults are `llama2` and `32000` respectively, which indicates the default Llama 2 tokenizer). This trains the model. Finally we are ready to run inference with our `run.c` script. For that we need two things. Number one, we have to export our tokenizer in the `.bin` format, do that with:
```
python tokenizer.py --tokenizer-model=data/tok4096.model
```
This writes the tokenizer to `data/tok4096.bin`. Now we can run inference, pointing it to this tokenizer using the `-z` flag:
```
./run out/model.bin -z data/tok4096.bin
```
This should print the samples. If you leave out the `-z` flag, it will use the default Llama 2 tokenizer, which would generate a good sequence of integers, but they would get translated using a different vocabulary to text, so it would look like gibberish.
## performance
@@ -235,8 +160,9 @@ You can also experiment with replacing `gcc` with `clang`.
If compiling with gcc, try experimenting with `-funroll-all-loops`, see PR [#183](https://github.com/karpathy/llama2.c/pull/183)
**OpenMP**. Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul and attention, allowing the work in the loops to be split up over multiple processors.
You'll need to install the OpenMP library and the clang compiler first (e.g. `apt install clang libomp-dev` on ubuntu). Then you can compile with `make runomp`, which does:
### OpenMP
Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul and attention, allowing the work in the loops to be split up over multiple processors.
You'll need to install the OpenMP library and the clang compiler first (e.g. `apt install clang libomp-dev` on ubuntu). I was not able to get improvements from OpenMP on my MacBook, though. Then you can compile with `make runomp`, which does:
```bash
clang -Ofast -fopenmp -march=native run.c -lm -o run
@@ -248,8 +174,7 @@ When you run inference make sure to use OpenMP flags to set the number of thread
OMP_NUM_THREADS=4 ./run out/model.bin
```
Depending on your system resources you may want to tweak these hyperparameters and use more threads. But more is not always better, usually this is a bit U shaped. In particular, if your CPU has SMT (multithreading), try setting the number of threads to the number of physical cores rather than logical cores. The performance difference can be large due to cache thrashing and communication overhead. The PyTorch documentation [CPU specific optimizations
](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) has some good information that applies here too.
Depending on your system resources you may want to tweak these hyperparameters and use more threads. But more is not always better, usually this is a bit U shaped.
## platforms
@@ -257,27 +182,6 @@ On **Windows**, use `build_msvc.bat` in a Visual Studio Command Prompt to build
On **Centos 7**, **Amazon Linux 2018** use `rungnu` Makefile target: `make rungnu` or `make runompgnu` to use openmp.
On **Mac**, use clang from brew for openmp build. Install clang as `brew install llvm` and use the installed clang binary to compile with openmp: `make runomp CC=/opt/homebrew/opt/llvm/bin/clang`
## tests
You can run tests simply with pytest:
```bash
$ pip install pytest
$ pytest
```
This will currently invoke two tests inside `test_all.py`, which forward the model in both C and Python for 200 steps and check the output against a known good expected output. The tests currently run in only a few seconds, but will have to download and cache the stories260K models in a temporary `test` directory (only ~2MB download).
There are also some tests in C, in the file [test.c](test.c). You can run these with `make testcc`, or to see more stuff printed:
```
make testcc VERBOSITY=1
```
Call for help: help add more tests.
## ack
I trained the llama2.c storyteller models on a 4X A100 40GB box graciously provided by the excellent [Lambda labs](https://lambdalabs.com/service/gpu-cloud), thank you.
@@ -310,8 +214,6 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
- [llama2.rs](https://github.com/gaxler/llama2.rs) by @[gaxler](https://github.com/gaxler): a Rust port of this project
- [llama2.rs](https://github.com/leo-du/llama2.rs) by @[leo-du](https://github.com/leo-du): A Rust port of this project
- [llama2-rs](https://github.com/danielgrittner/llama2-rs) by @[danielgrittner](https://github.com/danielgrittner): a Rust port of this project
- [llama2.rs](https://github.com/lintian06/llama2.rs) by @[lintian06](https://github.com/lintian06): A Rust port of this project
- [pecca.rs](https://github.com/rahoua/pecca-rs) by @[rahoua](https://github.com/rahoua): A Rust port leveraging [ndarray](https://github.com/rust-ndarray/ndarray), supports BLAS.
- Go
- [go-llama2](https://github.com/tmc/go-llama2) by @[tmc](https://github.com/tmc): a Go port of this project
- [llama2.go](https://github.com/nikolaydubina/llama2.go) by @[nikolaydubina](https://github.com/nikolaydubina): a Go port of this project
@@ -324,7 +226,6 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
- [llama2.cpp](https://github.com/leloykun/llama2.cpp) by @[leloykun](https://github.com/leloykun): a C++ port of this project
- JavaScript
- [llama2.js](https://github.com/epicure/llama2.js) by @[epicure](https://github.com/epicure): a JavaScript port of this project
- [llama2.ts](https://github.com/wizzard0/llama2.ts) by @[oleksandr_now](https://twitter.com/oleksandr_now): a TypeScript port of this project. Full Llama2-7B capable.
- [llama2.c-emscripten](https://github.com/gohai/llama2.c-emscripten) by @[gohai](https://github.com/gohai): Emscripten (JavaScript) port, based on @ggerganov's initial prototype
- Zig
- [llama2.zig](https://github.com/cgbur/llama2.zig) by @[cgbur](https://github.com/cgbur): A Zig port of this project
@@ -342,21 +243,18 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
- [llama2.py](https://github.com/tairov/llama2.py) by @[tairov](https://github.com/tairov): a simple one file pure Python port of this project with zero dependencies
- C#
- [llama2.cs](https://github.com/trrahul/llama2.cs) by @[trrahul](https://github.com/trrahul): a C# port of this project
- Dart
- [llama2.dart](https://github.com/yiminghan/llama2.dart) by @[yiminghan](https://github.com/yiminghan/llama2.dart): one-file dart port of this project, works with Flutter!
- WebAssembly
- [icpp-llm](https://github.com/icppWorld/icpp-llm): LLMs for the Internet Computer
- [llama2.c - Llama 2 Everywhere](https://github.com/trholding/llama2.c) by @[trholding](https://github.com/trholding): Standalone, Bootable & Portable Binary Llama 2
- [llama2.c-zh - Bilingual Chinese and English](https://github.com/chenyangMl/llama2.c-zh) by @[chenyangMl](https://github.com/chenyangMl): Expand tokenizer to support training and inference in both Chinese and English
## unsorted todos
- add support in run.c of reading version 1+ files from export, later deprecate "version 0"
- runq.c (int8 quantization) add
- run.cu (CUDA) investigate and merge
- add more tests inside [test.c](test.c)
- add Engine class for use in sample.py that does efficient inference in PyTorch, e.g. KV cache keeping
- make it easier to add a new dataset with not too much pain
- add multiquery support into run.c
- add custom bpe training code and the ability to train a smaller vocabulary (32K is too much)
- should calculate freq_cis online in the script run.c instead of loading them
- int4/8 quantization
- export the model in a more sensible output format with a proper header, etc.
- train a tiny Llama test model (committed to repo) and use it as reference in unit tests
- support Llama 2 7B Chat models and tune run.c to Chat UI/UX
- llama2.cu investigate and merge
- (LoRA) finetuning and export of Llama 2 models
## License
-58
View File
@@ -1,58 +0,0 @@
# stories260K
[Stories260K huggingface link](https://huggingface.co/karpathy/tinyllamas)
The 260K model is a tiny model used for testing, and was trained as follows:
```
python train.py \
--out_dir="outmini" \
--batch_size=128 \
--max_seq_len=512 \
--gradient_accumulation_steps=1 \
--vocab_source="custom" \
--vocab_size=512 \
--dim=64 \
--n_layers=5 \
--n_heads=8 \
--n_kv_heads=4 \
--multiple_of=4 \
--learning_rate=1e-3 \
--dropout=0.05 \
--weight_decay=0.01 \
--max_iters=100000 \
--beta2=0.99 \
--warmup_iters=1000 \
--eval_interval=2000 \
--eval_iters=100 \
--compile=True
```
You'll notice that `n_kv_heads` is 4 while `n_heads` is 8, so two heads at a time share their key,value projections, i.e. this model is 2X multiquery. You'll also notice that we're using a custom tokenizer with 512 tokens. The model trained for ~10 minutes (?) on my A100 and achieves validation loss of 1.2968.
Sampling this model at temperature 0.0 (i.e. deterministic greedy argmax sampling) gives:
```
$ ./run stories260K/stories260K.bin -z stories260K/tok512.bin -t 0.0
Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.
Lily's mom said, "Lily, let's go to the park." Lily was sad and didn't know what to do. She said, "I want to play with your ball, but I can't find it."
Lily was sad and didn't know what to do. She said, "I'm sorry, Lily. I didn't know what to do."
Lily didn't want to help her mom, so she said, "I'm sorry, mom. I didn't know what to do." Her mom said, "Don't worry, Lily. We can help you.
```
You can reproduce the same in Python by running `sample.py`:
```
$ python sample.py --checkpoint=stories260K/stories260K.pt --tokenizer=stories260K/tok512.model --temperature=0.0 --max_new_tokens=257
```
I hardcoded max tokens to be 257 manually because the `sample.py` script doesn't currently terminate on the special BOS token like the run.c script does. Sampling at 1.0 with topp of 0.9 gives a bit more reasonable samples:
```
$ ./run stories260K/stories260K.bin -z stories260K/tok512.bin -t 1.0 -p 0.9 -s 133742
Once upon a time, there was a little boy named Timmy. Timmy loved to play with his toys and eat sandwiches. One day, Timmy's mom told him it was time to rest for a while. Timmy's friend Billy came over and took him a down.
Timmy's mom saw that Timmy was sad, but Timmy said, "I didn't understand what is it! We need to find some leafs." Timmy thought about it and took a deep breath on a spoon. He hoped it was important to be kind and continued to find its image next time.
After they finished getting, Timmy's dad came up to his house and promised to help Timmy.
```
Hey you can't expect too much from a 260K parameter model. I'm even mildly shocked we get this far :D
-99
View File
@@ -1,99 +0,0 @@
# training llama tokenizer
How does Meta train their sentencepiece tokenizer? You can print the config as follows:
```python
import sentencepiece.sentencepiece_model_pb2
mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
mp.ParseFromString(open("tokenizer.model", "rb").read())
print(mp.trainer_spec)
print(mp.normalizer_spec)
```
this gives:
```
trainer_spec {
input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
model_type: BPE
vocab_size: 32000
self_test_sample_size: 0
input_format: "text"
character_coverage: 0.9999499917030334
input_sentence_size: 200000000
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
num_threads: 80
num_sub_iterations: 2
max_sentence_length: 4192
shuffle_input_sentence: true
max_sentencepiece_length: 16
split_by_unicode_script: true
split_by_whitespace: true
split_by_number: true
treat_whitespace_as_suffix: false
split_digits: true
allow_whitespace_only_pieces: true
vocabulary_output_piece_score: true
hard_vocab_limit: true
use_all_vocab: false
byte_fallback: true
required_chars: ""
unk_id: 0
bos_id: 1
eos_id: 2
pad_id: -1
unk_surface: " \342\201\207 "
unk_piece: "<unk>"
bos_piece: "<s>"
eos_piece: "</s>"
pad_piece: "<pad>"
train_extremely_large_corpus: false
enable_differential_privacy: false
differential_privacy_noise_level: 0.0
differential_privacy_clipping_threshold: 0
}
normalizer_spec {
name: "identity"
precompiled_charsmap: ""
add_dummy_prefix: true
remove_extra_whitespaces: false
normalization_rule_tsv: ""
}
```
We can use the sentencepiece spm_train to train the same models, but optionally smaller. Here are their [options docs](https://github.com/google/sentencepiece/blob/master/doc/options.md) we can refer to. It's not much but it helps.
We'll depart on one setting, I recommend changing `character_coverage` -> 1.0. We also want to make sure to note the following important settings that come up in the paper and are not necessarily the default sentencepiece settings:
```
--split_digits = true
--allow_whitespace_only_pieces = true
--byte_fallback = true
--normalization_rule_name = identity
```
With this in mind we can train a sentencepiece vocab in what I believe is probably the same to how Meta trained theirs as:
```
spm_train --input="$input" \
--model_prefix="$model_prefix" \
--model_type=bpe \
--vocab_size="$vocab_size" \
--self_test_sample_size=0 \
--input_format="text" \
--character_coverage=1.0 \
--num_threads="$(nproc)" \
--split_digits=true \
--allow_whitespace_only_pieces=true \
--byte_fallback=true \
--unk_surface=" \342\201\207 " \
--normalization_rule_name=identity \
```
Where $input is the input file, $model_prefix is the output path prefix, vocab_size is the desired vocab, and we're by default taking over the CPU resources of the machine.
Lastly note that sentencepiece is weird and expects "sentences" delimited by newlines as the input. You can't just put in a massive block of text. And they have a hyperparameter that controls the maximum size of a "sentence". Fwiw I really dislike this design choice around a weird concept of a "sentence". It should just be a block of text with no assumptions. But here we are.
Look into the file `tinystories.py` where we train the vocab in the same way, but using Python bindings instead.
-471
View File
@@ -1,471 +0,0 @@
"""
This script has functions and utilities for model export.
Basically, we have a bunch of versions of the model, and we
want to export them to .bin files to be read from and inferenced in C.
Among the "input" versions of PyTorch files/models:
- Official Llama 2 weights released by Meta
- Huggingface weights available on the hub
- llama2.c (this repo) trained models
Among the "output" versions of .bin files:
- v0: Legacy files of the original llama2.c repo (will eventually be DEPRECATED)
- v1-vN: Improved .bin files with a proper header, cache alignment, etc.
This script aspires to provide all of these conversions.
"""
import os
import gzip
import shutil
import struct
import argparse
import json
from pathlib import Path
import numpy as np
import torch
from torch import nn
from model import ModelArgs, Transformer
# -----------------------------------------------------------------------------
# common utilities
def serialize_fp32(file, tensor):
    """Write one tensor to `file` as a flat run of raw float32 values.

    Args:
        file: binary file object opened for writing (e.g. mode 'wb').
        tensor: torch tensor of any shape; it is detached, moved to CPU,
            flattened and converted to float32 before being written.

    The bytes are emitted in native byte order, i.e. exactly what
    struct.pack(f'{n}f', *values) would produce.
    """
    # reshape (unlike view) also accepts non-contiguous tensors, e.g. transposes
    d = tensor.detach().cpu().reshape(-1).to(torch.float32).numpy()
    # dump the buffer directly instead of unpacking every element into a
    # Python float argument for struct.pack (very slow for large weights)
    file.write(d.tobytes())
def serialize_int8(file, tensor):
    """Write one tensor to `file` as a flat run of raw int8 values.

    Args:
        file: binary file object opened for writing (e.g. mode 'wb').
        tensor: torch tensor of any shape; it is detached, moved to CPU,
            flattened and cast to int8 before being written.
    """
    # reshape (unlike view) also accepts non-contiguous tensors
    d = tensor.detach().cpu().reshape(-1).numpy().astype(np.int8)
    # one byte per element; equivalent to struct.pack(f'{n}b', *d) without
    # materializing every element as a Python int
    file.write(d.tobytes())
def quantize_q80(w, group_size):
    """
    Takes a tensor and returns its Q8_0 quantized version, i.e. symmetric
    quantization into int8, range [-127, 127], done independently per group.

    Args:
        w: tensor whose number of elements is a multiple of group_size.
        group_size: number of consecutive elements that share one scale.

    Returns:
        (int8val, scale, maxerr) where
        - int8val: int8 tensor of shape (numel // group_size, group_size)
        - scale: float32 tensor with one scaling factor per group, such that
          float ~= int8val * scale
        - maxerr: largest absolute dequantization error over all elements
    """
    assert w.numel() % group_size == 0
    w = w.float().reshape(-1, group_size)  # float32, one row per group
    # find the max magnitude in each group
    wmax = torch.abs(w).max(dim=1).values
    # calculate the scaling factor such that float = quant * scale
    scale = wmax / 127.0
    # a group of all zeros has scale 0; dividing by it would produce NaNs
    # whose cast to int8 is undefined, so divide by 1 there instead
    # (the quantized values are 0 either way and the stored scale stays 0)
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    # scale into range [-127, 127] and round to nearest integer
    int8val = torch.round(w / divisor[:, None]).to(torch.int8)
    # dequantize by rescaling, to measure the error introduced
    fp32valr = int8val.float() * scale[:, None]
    # max error in each group, then across all groups
    err = torch.abs(fp32valr - w).max(dim=1).values
    maxerr = err.max().item()
    return int8val, scale, maxerr
# -----------------------------------------------------------------------------
# legacy
def legacy_export(model, filepath):
    """Original export of llama2.c bin files, i.e. version v0.

    Layout: a header of 7 ints, then fp32 tensors in this order: token
    embeddings, per-layer attention norms, wq, wk, wv, wo, per-layer ffn
    norms, w1, w2, w3, final norm, freqs_cos, freqs_sin and, only when the
    classifier is not shared with the embedding, the output weights.

    Args:
        model: Transformer instance to export.
        filepath: path of the .bin file to write.
    """
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    p = model.params
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    # legacy format uses negative/positive vocab size as a shared classifier flag.
    # Use a local value so the model's own params are not modified (negating
    # p.vocab_size in place would flip the sign again on a second export).
    vocab_size = p.vocab_size if shared_classifier else -p.vocab_size
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                         n_kv_heads, vocab_size, p.max_seq_len)

    # the with-block closes the file even if serialization fails part-way
    with open(filepath, 'wb') as out_file:
        # first write out the header
        out_file.write(header)

        # next write out the embedding weights
        serialize_fp32(out_file, model.tok_embeddings.weight)

        # now all the layers
        # attention weights
        for layer in model.layers:
            serialize_fp32(out_file, layer.attention_norm.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.attention.wq.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.attention.wk.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.attention.wv.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.attention.wo.weight)
        # ffn weights
        for layer in model.layers:
            serialize_fp32(out_file, layer.ffn_norm.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.feed_forward.w1.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.feed_forward.w2.weight)
        for layer in model.layers:
            serialize_fp32(out_file, layer.feed_forward.w3.weight)
        # final rmsnorm
        serialize_fp32(out_file, model.norm.weight)
        # freqs_cis
        serialize_fp32(out_file, model.freqs_cos[:p.max_seq_len])
        serialize_fp32(out_file, model.freqs_sin[:p.max_seq_len])

        # final classifier weights
        if not shared_classifier:
            serialize_fp32(out_file, model.output.weight)

    print(f"wrote {filepath}")
# -----------------------------------------------------------------------------
# new version
def version1_export(model, filepath):
    """
    Export the model weights in full float32 .bin file to be read from C.
    This is same as legacy_export, but with a proper header.

    The file starts with a 256-byte header (magic, version, 7 int params,
    a shared-classifier flag byte, zero padding) followed by fp32 tensors.

    Args:
        model: Transformer instance to export.
        filepath: path of the .bin file to write.
    """
    version = 1

    p = model.params
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)

    # all the params, in the order they are written to the file
    weights = [
        *[layer.attention_norm.weight for layer in model.layers],
        *[layer.ffn_norm.weight for layer in model.layers],
        model.norm.weight,
        model.tok_embeddings.weight,
        *[layer.attention.wq.weight for layer in model.layers],
        *[layer.attention.wk.weight for layer in model.layers],
        *[layer.attention.wv.weight for layer in model.layers],
        *[layer.attention.wo.weight for layer in model.layers],
        *[layer.feed_forward.w1.weight for layer in model.layers],
        *[layer.feed_forward.w2.weight for layer in model.layers],
        *[layer.feed_forward.w3.weight for layer in model.layers],
    ]
    if not shared_classifier:
        weights.append(model.output.weight)

    # the with-block closes the file even if serialization fails part-way
    with open(filepath, 'wb') as out_file:
        # first write out the header. the header will be 256 bytes
        # 1) write magic, which will be uint32 of "ak42" in ASCII
        out_file.write(struct.pack('I', 0x616b3432))
        # 2) write version, which will be int
        out_file.write(struct.pack('i', version))
        # 3) write the params, which will be 7 ints
        out_file.write(struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                   n_kv_heads, p.vocab_size, p.max_seq_len))
        # 4) write some other flags
        out_file.write(struct.pack('B', int(shared_classifier)))
        pad = 256 - out_file.tell()  # pad rest with zeros; tell returns current pos
        assert pad >= 0
        out_file.write(b'\0' * pad)

        # now let's write out all the params
        for w in weights:
            serialize_fp32(out_file, w)

    print(f"wrote {filepath}")
def version2_export(model, filepath, group_size=64):
    """
    Export the model weights in Q8_0 into .bin file to be read from C.
    That is:
    - quantize all weights to symmetric int8, in range [-127, 127]
    - all other tensors (the rmsnorm params) are kept and exported in fp32
    - quantization is done in groups of group_size to reduce the effects of any outliers

    Args:
        model: Transformer instance to export.
        filepath: path of the .bin file to write.
        group_size: requested quantization group size; it is halved until it
            divides model.params.dim.
    """
    version = 2

    # let's first do some validation for this export type
    while model.params.dim % group_size != 0:
        group_size //= 2
        print(f"BACKOFF: reducing group size to {group_size} to fit hidden_dim")
    weights = [
        model.tok_embeddings.weight,
        *[layer.attention.wq.weight for layer in model.layers],
        *[layer.attention.wk.weight for layer in model.layers],
        *[layer.attention.wv.weight for layer in model.layers],
        *[layer.attention.wo.weight for layer in model.layers],
        *[layer.feed_forward.w1.weight for layer in model.layers],
        *[layer.feed_forward.w2.weight for layer in model.layers],
        *[layer.feed_forward.w3.weight for layer in model.layers],
    ]
    shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
    if not shared_classifier:
        weights.append(model.output.weight)
    # enumerate so the failure message can name the offending weight
    for i, w in enumerate(weights):
        assert w.numel() % group_size == 0, f"weight {i} has numel {w.numel()}, not a multiple of group_size {group_size}"

    p = model.params
    hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads

    # the with-block closes the file even if quantization/serialization fails
    with open(filepath, 'wb') as out_file:
        # first write out the header. the header will be 256 bytes
        # 1) write magic, which will be uint32 of "ak42" in ASCII
        out_file.write(struct.pack('I', 0x616b3432))
        # 2) write version, which will be int
        out_file.write(struct.pack('i', version))
        # 3) write the params, which will be 7 ints
        out_file.write(struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                                   n_kv_heads, p.vocab_size, p.max_seq_len))
        # 4) write some other flags
        out_file.write(struct.pack('B', int(shared_classifier)))
        out_file.write(struct.pack('i', group_size))  # group size used for quantization
        pad = 256 - out_file.tell()  # pad rest with zeros; tell returns current pos
        assert pad >= 0
        out_file.write(b'\0' * pad)
        # now that the header is done, let's write out the model

        # first let's write out all the params that we are keeping in fp32: the norms
        for layer in model.layers:  # attention norms
            serialize_fp32(out_file, layer.attention_norm.weight)
        for layer in model.layers:  # MLP norms
            serialize_fp32(out_file, layer.ffn_norm.weight)
        serialize_fp32(out_file, model.norm.weight)  # final pre-classifier norm

        # now let's write out all the params that we are quantizing to Q8_0
        # note the classifier weights are only in this list when they are
        # not shared with the embedding
        max_err = 0.0
        scales = []
        for i, w in enumerate(weights):
            # quantize this weight
            q, s, err = quantize_q80(w, group_size)
            # save the int8 weights to file
            serialize_int8(out_file, q)
            scales.append(s)  # we'll do all the scales after all the qs
            # logging
            max_err = max(max_err, err)
            print(f"{i+1}/{len(weights)} quantized {tuple(w.shape)} to Q8_0 with max error {err}")

        # save the scaling factors in fp32 here
        # this is done to keep all the weights contiguous, making pointer arithmetic easier in C
        for s in scales:
            serialize_fp32(out_file, s)

    # print the highest error across all weights, should be very small, e.g. O(~0.001)
    print(f"max quantization group error across all weights: {max_err}")
    print(f"wrote {filepath}")
# -----------------------------------------------------------------------------
# Load / import functions
def load_checkpoint(checkpoint):
    """Build a Transformer from a checkpoint file and return it in eval mode.

    The checkpoint is loaded onto the CPU and is expected to hold the model
    constructor arguments under 'model_args' and the weights under 'model'.
    """
    ckpt = torch.load(checkpoint, map_location='cpu')
    model = Transformer(ModelArgs(**ckpt['model_args']))

    # drop the '_orig_mod.' prefix from any weight name that carries it
    weights = ckpt['model']
    prefix = '_orig_mod.'
    for key in list(weights.keys()):
        if key.startswith(prefix):
            weights[key[len(prefix):]] = weights.pop(key)

    model.load_state_dict(weights, strict=False)
    model.eval()
    return model
def load_meta_model(model_path):
    """Load the weights found in `model_path` into a Transformer.

    Reads params.json plus every consolidated.*.pth shard in the directory,
    merges the shards into one state dict and returns the model in eval mode.
    """
    with open(os.path.join(model_path, 'params.json')) as f:
        params = json.load(f)
        print(params)

    shard_paths = sorted(Path(model_path).glob('consolidated.*.pth'))
    models = [torch.load(path, map_location='cpu') for path in shard_paths]

    def merge_shards(shards):
        """Concatenate each tensor across shards, freeing shard copies as we go."""
        merged = {}
        dim1_suffixes = ('.attention.wo.weight', '.feed_forward.w2.weight')
        for name in list(shards[0]):
            pieces = [shard[name] for shard in shards]
            # a single shard or a 1-D tensor is taken from the first shard as is
            if len(pieces) == 1 or len(pieces[0].shape) == 1:
                merged[name] = pieces[0]
                continue
            # these tensors are joined along dim 1, all others along dim 0
            along_dim1 = name.startswith('tok_embeddings.') or name.endswith(dim1_suffixes)
            merged[name] = torch.cat(pieces, dim=1 if along_dim1 else 0)
            for shard in shards:
                del shard[name]
        return merged

    state_dict = merge_shards(models)
    del models

    # set ModelArgs
    config = ModelArgs()
    config.dim = params["dim"]
    config.n_layers = params["n_layers"]
    config.n_heads = params["n_heads"]
    config.n_kv_heads = params.get('n_kv_heads') or params['n_heads']
    config.multiple_of = params["multiple_of"]
    config.norm_eps = params["norm_eps"]
    config.vocab_size = state_dict['tok_embeddings.weight'].shape[0]
    config.max_seq_len = 2048

    def as_param(key):
        return nn.Parameter(state_dict[key])

    # create a new Transformer object and set weights
    model = Transformer(config)
    model.tok_embeddings.weight = as_param('tok_embeddings.weight')
    model.norm.weight = as_param('norm.weight')

    for layer in model.layers:
        prefix = f'layers.{layer.layer_id}.'
        layer.attention_norm.weight = as_param(prefix + 'attention_norm.weight')
        layer.attention.wq.weight = as_param(prefix + 'attention.wq.weight')
        layer.attention.wk.weight = as_param(prefix + 'attention.wk.weight')
        layer.attention.wv.weight = as_param(prefix + 'attention.wv.weight')
        layer.attention.wo.weight = as_param(prefix + 'attention.wo.weight')
        layer.ffn_norm.weight = as_param(prefix + 'ffn_norm.weight')
        layer.feed_forward.w1.weight = as_param(prefix + 'feed_forward.w1.weight')
        layer.feed_forward.w2.weight = as_param(prefix + 'feed_forward.w2.weight')
        layer.feed_forward.w3.weight = as_param(prefix + 'feed_forward.w3.weight')

    # final classifier
    model.output.weight = as_param('output.weight')
    model.eval()
    return model
def load_hf_model(model_path):
    """Load a huggingface causal LM from `model_path` into a Transformer.

    Returns the model in eval mode, or None (after printing install
    instructions) when the transformers package cannot be imported.
    """
    try:
        from transformers import AutoModelForCausalLM
    except ImportError:
        print("Error: transformers package is required to load huggingface models")
        print("Please run `pip install transformers` to install it")
        return None

    # load HF model
    hf_model = AutoModelForCausalLM.from_pretrained(model_path)
    hf_dict = hf_model.state_dict()
    hf_config = hf_model.config

    # convert LlamaConfig to ModelArgs
    config = ModelArgs()
    config.dim = hf_config.hidden_size
    config.n_layers = hf_config.num_hidden_layers
    config.n_heads = hf_config.num_attention_heads
    config.n_kv_heads = hf_config.num_attention_heads
    config.vocab_size = hf_config.vocab_size
    config.hidden_dim = hf_config.intermediate_size
    config.norm_eps = hf_config.rms_norm_eps
    config.max_seq_len = hf_config.max_position_embeddings

    def as_param(key):
        return nn.Parameter(hf_dict[key])

    # huggingface permutes WQ and WK, this function reverses it
    n_heads, dim = config.n_heads, config.dim

    def unpermute(w):
        return w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)

    # create a new Transformer object and set weights
    model = Transformer(config)
    model.tok_embeddings.weight = as_param('model.embed_tokens.weight')
    model.norm.weight = as_param('model.norm.weight')

    for layer in model.layers:
        prefix = f'model.layers.{layer.layer_id}.'
        layer.attention_norm.weight = as_param(prefix + 'input_layernorm.weight')
        layer.attention.wq.weight = nn.Parameter(unpermute(hf_dict[prefix + 'self_attn.q_proj.weight']))
        layer.attention.wk.weight = nn.Parameter(unpermute(hf_dict[prefix + 'self_attn.k_proj.weight']))
        layer.attention.wv.weight = as_param(prefix + 'self_attn.v_proj.weight')
        layer.attention.wo.weight = as_param(prefix + 'self_attn.o_proj.weight')
        layer.ffn_norm.weight = as_param(prefix + 'post_attention_layernorm.weight')
        layer.feed_forward.w1.weight = as_param(prefix + 'mlp.gate_proj.weight')
        layer.feed_forward.w2.weight = as_param(prefix + 'mlp.down_proj.weight')
        layer.feed_forward.w3.weight = as_param(prefix + 'mlp.up_proj.weight')

    # final classifier
    model.output.weight = as_param('lm_head.weight')
    model.eval()
    return model
# -----------------------------------------------------------------------------
# API entrypoint
def model_export(model, filepath, version):
    """Export `model` to `filepath` using the .bin format `version` (0, 1 or 2).

    Raises:
        ValueError: if `version` is not one of the known versions.
    """
    if version == 0:
        legacy_export(model, filepath)
        return
    if version == 1:
        version1_export(model, filepath)
        return
    if version == 2:
        version2_export(model, filepath)
        return
    raise ValueError(f"unknown version {version}")
def torchscript_export(model, filepath, zero_params=False, gzip_output=False):
    """
    (This was submitted via a PR earlier. Leaving it here, but "orphaned" for now)
    Saves the model as a TorchScript.
    The resulting file can be loaded in C++ code and then used for training or
    inference with:
        #include <torch/script.h>
        torch::jit::Module module = torch::jit::load("model.pt")
    Note that the serialized model includes the initial parameters and with the default
    ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
    the model parameters separately you can zero out the parameters before saving it and
    it will gzip down to 780K.

    Args:
        model: the module to script and save.
        filepath: where the TorchScript file is written.
        zero_params: zero every parameter in place before saving.
        gzip_output: write "<filepath>.gz" and remove the uncompressed file.
    """
    # Zeroed parameters compress far better, so this is mostly useful
    # together with gzip_output.
    if zero_params:
        for param in model.parameters():
            param.detach().zero_()

    torch.jit.save(torch.jit.script(model), filepath)

    if not gzip_output:
        return
    with open(filepath, "rb") as raw, gzip.open(f"{filepath}.gz", "wb") as packed:
        shutil.copyfileobj(raw, packed)
    os.unlink(filepath)
# -----------------------------------------------------------------------------
# CLI entrypoint
if __name__ == "__main__":

    # command line: an output path, an export version and exactly one model source
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath", type=str, help="the output filepath")
    parser.add_argument("--version", default=0, type=int, help="the version to export with")
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--checkpoint", type=str, help="model checkpoint, .pt file")
    source.add_argument("--meta-llama", type=str, help="meta llama model path")
    source.add_argument("--hf", type=str, help="huggingface model path")
    args = parser.parse_args()

    # load the model from whichever source was given
    if args.checkpoint:
        model = load_checkpoint(args.checkpoint)
    elif args.meta_llama:
        model = load_meta_model(args.meta_llama)
    elif args.hf:
        model = load_hf_model(args.hf)

    # a loader returns None when it could not produce a model
    if model is None:
        parser.error("Can't load input model!")

    # export
    model_export(model, args.filepath, args.version)
+112
View File
@@ -0,0 +1,112 @@
"""
This script exports the Llama 2 weights in llama2c.bin format.
"""
import os
import sys
import struct
from pathlib import Path
import json
import torch
from model import precompute_freqs_cis
def export(p, state_dict, filepath='model.bin'):
    """export the model weights in fp32 into .bin file to be read from C

    Args:
        p: dict of model hyperparameters (needs 'dim', 'n_layers', 'n_heads'
            and optionally 'n_kv_heads'). 'vocab_size' and 'max_seq_len' are
            overwritten in place.
        state_dict: dict of weight name -> tensor. Entries are deleted as
            they are written, to release memory early.
        filepath: path of the .bin file to write.
    """
    # first work out the header
    hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
    # take the vocab size from the embedding table that is actually written,
    # rather than assuming 32000, so header and payload cannot disagree
    p['vocab_size'] = state_dict['tok_embeddings.weight'].shape[0]
    p['max_seq_len'] = 2048

    n_kv_heads = p.get('n_kv_heads') or p['n_heads']
    header = struct.pack(
        'iiiiiii',
        p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
        n_kv_heads, -p['vocab_size'], p['max_seq_len']
    )
    # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
    # in the checkpoint and should be loaded.

    # the with-block closes the file even if a weight is missing or a write fails
    with open(filepath, 'wb') as f:

        def serialize(key):
            print(f"writing {key}...")
            t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
            f.write(memoryview(t))
            del state_dict[key]

        f.write(header)

        # next write out the embedding weights
        print("writing tok_embeddings...")
        serialize('tok_embeddings.weight')

        # now all the layers
        # attention weights
        for i in range(p['n_layers']): serialize(f'layers.{i}.attention_norm.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wq.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wk.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wv.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wo.weight')
        # ffn weights
        for i in range(p['n_layers']): serialize(f'layers.{i}.ffn_norm.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w1.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w2.weight')
        for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w3.weight')

        # final rmsnorm
        serialize('norm.weight')
        # freqs_cos, freqs_sin
        freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
        state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
        state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
        serialize('freqs_cos')
        serialize('freqs_sin')

        # finally write the output weights
        serialize('output.weight')

    print(f"wrote {filepath}")
def concat_weights(models):
    """Merge a list of per-shard state dicts into a single state dict.

    For every name in the first shard: if there is only one shard, or the
    tensor is 1-D, the first shard's tensor is used as is. Otherwise the
    shards' tensors are concatenated (along dim 1 for the token embedding,
    attention.wo and feed_forward.w2 weights, along dim 0 for everything
    else) and the per-shard entries are deleted from the input dicts.
    """
    merged = {}
    dim1_suffixes = ('.attention.wo.weight', '.feed_forward.w2.weight')
    for name in list(models[0]):
        shards = [shard[name] for shard in models]
        if len(shards) == 1 or shards[0].dim() == 1:
            merged[name] = shards[0]
        else:
            along_dim1 = name.startswith('tok_embeddings.') or name.endswith(dim1_suffixes)
            merged[name] = torch.cat(shards, dim=1 if along_dim1 else 0)
            # the shard copies are no longer needed once concatenated
            for shard in models:
                del shard[name]
    return merged
def load_and_export(model_path, output_path):
    """Read params.json and all consolidated.*.pth shards found in
    `model_path`, merge the shards and export the result to `output_path`."""
    with open(os.path.join(model_path, 'params.json')) as params_file:
        params = json.load(params_file)
        print(params)

    shard_paths = sorted(Path(model_path).glob('consolidated.*.pth'))
    shards = [torch.load(path, map_location='cpu') for path in shard_paths]

    state_dict = concat_weights(shards)
    del shards  # drop our reference to the per-shard dicts before exporting
    export(params, state_dict, output_path)
if __name__ == '__main__':
    # both positional arguments are required; checking only for "no arguments"
    # would let a single argument through and crash on sys.argv[2]
    if len(sys.argv) < 3:
        print('[Llama model folder path] [output path]')
        sys.exit(1)

    model_path = sys.argv[1]
    output_path = sys.argv[2]
    load_and_export(model_path, output_path)
+57 -10
View File
@@ -11,14 +11,12 @@ from torch import nn
@dataclass
class ModelArgs:
# default hyperparameters for the Llama 7B model
dim: int = 4096
n_layers: int = 32
n_heads: int = 32
n_kv_heads: Optional[int] = None
vocab_size: int = 32000
hidden_dim: Optional[int] = None
multiple_of: int = 256 # MLP hidden layer size will be multiple of
vocab_size: int = -1 # defined later by tokenizer
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
norm_eps: float = 1e-5
max_seq_len: int = 2048
dropout: float = 0.0
@@ -95,7 +93,6 @@ class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
assert args.n_heads % self.n_kv_heads == 0
model_parallel_size = 1
self.n_local_heads = args.n_heads // model_parallel_size
self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
@@ -167,10 +164,8 @@ class Attention(nn.Module):
class FeedForward(nn.Module):
def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
super().__init__()
if hidden_dim is None:
hidden_dim = 4 * dim
hidden_dim = int(2 * hidden_dim / 3)
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
hidden_dim = int(2 * hidden_dim / 3)
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
self.w1 = nn.Linear(dim, hidden_dim, bias=False)
self.w2 = nn.Linear(hidden_dim, dim, bias=False)
self.w3 = nn.Linear(dim, hidden_dim, bias=False)
@@ -189,7 +184,7 @@ class TransformerBlock(nn.Module):
self.attention = Attention(args)
self.feed_forward = FeedForward(
dim=args.dim,
hidden_dim=args.hidden_dim,
hidden_dim=4 * args.dim,
multiple_of=args.multiple_of,
dropout=args.dropout,
)
@@ -341,3 +336,55 @@ class Transformer(nn.Module):
idx = torch.cat((idx, idx_next), dim=1)
return idx
def export(self, filepath='model.bin'):
    """export the model weights in fp32 into .bin file to be read from C

    Layout: a header of 7 ints (dim, hidden_dim, n_layers, n_heads,
    n_kv_heads, vocab_size, max_seq_len) followed by float32 tensors:
    token embeddings, per-layer attention norms, wq, wk, wv, wo, per-layer
    ffn norms, w1, w2, w3, the final norm, freqs_cos and freqs_sin.

    Args:
        filepath: path of the .bin file to write.
    """
    # first work out the header
    hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
    p = self.params
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
    header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                         n_kv_heads, p.vocab_size, p.max_seq_len)

    # the with-block closes the file even if a write fails part-way
    with open(filepath, 'wb') as f:

        def serialize(t):
            d = t.detach().cpu().reshape(-1).numpy().astype(np.float32)
            # dump the buffer directly; same bytes as struct.pack(f'{n}f', *d)
            # without turning every element into a Python float argument
            f.write(d.tobytes())

        f.write(header)

        # next write out the embedding weights
        serialize(self.tok_embeddings.weight)

        # now all the layers
        # attention weights
        for layer in self.layers:
            serialize(layer.attention_norm.weight)
        for layer in self.layers:
            serialize(layer.attention.wq.weight)
        for layer in self.layers:
            serialize(layer.attention.wk.weight)
        for layer in self.layers:
            serialize(layer.attention.wv.weight)
        for layer in self.layers:
            serialize(layer.attention.wo.weight)
        # ffn weights
        for layer in self.layers:
            serialize(layer.ffn_norm.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w1.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w2.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w3.weight)
        # final rmsnorm
        serialize(self.norm.weight)
        # note: no need to write final classifier weights due to weight sharing
        # freqs_cis
        serialize(self.freqs_cos[:p.max_seq_len])
        serialize(self.freqs_sin[:p.max_seq_len])

    print(f"wrote {filepath}")
+1
View File
@@ -2,6 +2,7 @@ numpy==1.23.5
pytest==7.4.0
Requests==2.31.0
sentencepiece==0.1.99
tiktoken==0.3.3
torch==2.0.1
tqdm==4.64.1
wandb==0.15.5
+306 -577
View File
File diff suppressed because it is too large Load Diff
-21
View File
@@ -89,27 +89,6 @@
"cmd = f'./run {model_file} -t {temperature} -p {top_p} -n {max_token} -i \"{prompt}\"'\n",
"!{cmd}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#@title Run Meta's Llama 2 models\n",
"\n",
"#@markdown input your huggingface [access token](https://huggingface.co/settings/tokens) to download Meta's Llama 2 models.\n",
"\n",
"from huggingface_hub import snapshot_download\n",
"\n",
"token = \"replace your huggingface access token\" #@param {type:\"string\"}\n",
"path = snapshot_download(repo_id=\"meta-llama/Llama-2-7b\",cache_dir=\"Llama-2-7b\", use_auth_token=token)\n",
"\n",
"!python export_meta_llama_bin.py $path llama2_7b.bin\n",
"\n",
"print(\"./run llama2_7b.bin\\n\")\n",
"!./run llama2_7b.bin"
]
}
],
"metadata": {
+7 -17
View File
@@ -5,19 +5,17 @@ import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from model import ModelArgs, Transformer
from tokenizer import Tokenizer
from tinystories import get_tokenizer_model_path
# -----------------------------------------------------------------------------
checkpoint = 'out/ckpt.pt'
out_dir = 'out' # ignored if init_from is not 'resume'
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 1 # number of samples to draw
max_new_tokens = 100 # number of tokens generated in each sample
temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability
tokenizer = "" # override the tokenizer model path
seed = 1337
device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
@@ -35,10 +33,11 @@ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torc
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
# init from a model saved in a specific directory
checkpoint_dict = torch.load(checkpoint, map_location=device)
gptconf = ModelArgs(**checkpoint_dict['model_args'])
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = ModelArgs(**checkpoint['model_args'])
model = Transformer(gptconf)
state_dict = checkpoint_dict['model']
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
@@ -52,16 +51,7 @@ if compile:
model = torch.compile(model) # requires PyTorch 2.0 (optional)
# load the tokenizer
vocab_source = checkpoint_dict["config"].get("vocab_source", "llama2")
vocab_size = gptconf.vocab_size
if tokenizer:
# a specific tokenizer is provided, use it
tokenizer_model = tokenizer
else:
# let's try to find the tokenizer model automatically. bit gross here...
query_vocab_size = 0 if vocab_source == "llama2" else vocab_size
tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size)
enc = Tokenizer(tokenizer_model=tokenizer_model)
enc = Tokenizer()
# encode the beginning of the prompt
if start.startswith('FILE:'):
+66
View File
@@ -0,0 +1,66 @@
#!/usr/bin/env python
"""Saves the model as a TorchScript.
Usage examples:
./save_torchscript.py
./save_torchscript.py --dim=300
./save_torchscript.py --gzip_output=True --zero_params=True
The resulting file can be loaded in C++ code and then used for training or
inference with:
#include <torch/script.h>
torch::jit::Module module = torch::jit::load("model.pt")
Note that the serialized model includes the initial parameters and with the default
ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
the model parameters separately you can zero out the parameters before saving it and
it will gzip down to 780K.
"""
import gzip
import os
import shutil
from inspect import signature
import torch
from model import ModelArgs, Transformer
# Model args config
# These module-level names are looked up by name in main() to build ModelArgs.
dim = 288
n_layers = 6
n_heads = 6
n_kv_heads = n_heads
multiple_of = 32
max_seq_len = 256
dropout = 0.0
vocab_size = 32000
norm_eps = 1e-5
# Save config
model_path = "model.pt"  # where the TorchScript file is written
zero_params = False  # zero all parameters before saving
gzip_output = False  # write "<model_path>.gz" and remove the uncompressed file
# Allow config overrides
# NOTE(review): exec() runs configurator.py in this module's namespace, so that
# file can rebind any name defined above (and run arbitrary code); the handle
# returned by open() is never explicitly closed.
exec(open("configurator.py").read())
def main() -> None:
    """Build a Transformer from the module-level config and save it as TorchScript."""
    # every ModelArgs constructor parameter is read from the module global of the same name
    arg_names = signature(ModelArgs).parameters
    model = Transformer(ModelArgs(**{name: globals()[name] for name in arg_names}))

    # Zeroed parameters compress far better, so this is mostly useful
    # together with gzip_output.
    if zero_params:
        for param in model.parameters():
            param.detach().zero_()

    torch.jit.save(torch.jit.script(model), model_path)

    if not gzip_output:
        return
    with open(model_path, "rb") as raw, gzip.open(f"{model_path}.gz", "wb") as packed:
        shutil.copyfileobj(raw, packed)
    os.unlink(model_path)


if __name__ == "__main__":
    main()
-84
View File
@@ -1,84 +0,0 @@
#define TESTING
#include "run.c"
/* Terminates the process with EXIT_FAILURE, after printing both values,
 * when a and b differ; returns normally when they are equal. */
void assert_eq(int a, int b) {
    if (a == b) {
        return;
    }
    printf("Assertion failed: %d != %d\n", a, b);
    exit(EXIT_FAILURE);
}
/* Encodes `prompt` with `tokenizer` and checks that the result is exactly the
 * `num_expected_tokens` ids in `expected_tokens`. Exits the process (via
 * assert_eq) on the first mismatch. */
void test_prompt_encoding(Tokenizer* tokenizer, char* prompt, int* expected_tokens, int num_expected_tokens) {
    // encode
    // the buffer holds strlen(prompt)+3 ints
    // (presumably slack for terminator/BOS/EOS -- TODO confirm against encode())
    int* prompt_tokens = (int*)malloc((strlen(prompt)+3) * sizeof(int));
    if (prompt_tokens == NULL) {
        // fail loudly instead of letting encode() write through a NULL pointer
        fprintf(stderr, "malloc failed!\n");
        exit(EXIT_FAILURE);
    }
    int num_prompt_tokens = 0; // the total number of prompt tokens
    encode(tokenizer, prompt, 1, 0, prompt_tokens, &num_prompt_tokens);

    #if VERBOSITY == 1
    // print maybe
    printf("expected tokens:\n");
    for (int i = 0; i < num_expected_tokens; i++) printf("%d ", expected_tokens[i]);
    printf("\n");
    printf("actual tokens:\n");
    for (int i = 0; i < num_prompt_tokens; i++) printf("%d ", prompt_tokens[i]);
    printf("\n");
    #endif

    // verify: same count first, then the same id at every position
    assert_eq(num_prompt_tokens, num_expected_tokens);
    for (int i = 0; i < num_prompt_tokens; i++) {
        assert_eq(prompt_tokens[i], expected_tokens[i]);
    }

    #if VERBOSITY == 1
    printf("OK\n");
    printf("---\n");
    #endif
    free(prompt_tokens);
}
// Builds a tokenizer from "tokenizer.bin" (vocab size 32000) and checks the
// encoding of five fixed prompts against hard-coded expected token ids.
void test_prompt_encodings() {
// let's verify that the Tokenizer works as expected
char *tokenizer_path = "tokenizer.bin";
int vocab_size = 32000;
Tokenizer tokenizer;
build_tokenizer(&tokenizer, tokenizer_path, vocab_size);
// test 0 (test the empty string) (I added this as a simple case)
char *prompt0 = "";
int expected_tokens0[] = {1};
test_prompt_encoding(&tokenizer, prompt0, expected_tokens0, sizeof(expected_tokens0) / sizeof(int));
// the tests below are taken from the Meta Llama 2 repo example code
// https://github.com/facebookresearch/llama/blob/main/example_text_completion.py
// and the expected tokens come from me breaking in the debugger in Python
// test 1
char *prompt = "I believe the meaning of life is";
int expected_tokens[] = {1, 306, 4658, 278, 6593, 310, 2834, 338};
test_prompt_encoding(&tokenizer, prompt, expected_tokens, sizeof(expected_tokens) / sizeof(int));
// test 2
char* prompt2 = "Simply put, the theory of relativity states that ";
int expected_tokens2[] = {1, 3439, 17632, 1925, 29892, 278, 6368, 310, 14215, 537, 5922, 393, 29871};
test_prompt_encoding(&tokenizer, prompt2, expected_tokens2, sizeof(expected_tokens2) / sizeof(int));
// test 3
// NOTE(review): the expected ids contain a separate token (4706) after each
// "\n\n" that a single space would presumably not produce -- confirm that the
// whitespace inside this literal (and the one in test 4) is intact.
char* prompt3 = "A brief message congratulating the team on the launch:\n\n Hi everyone,\n\n I just ";
int expected_tokens3[] = {1, 319, 11473, 2643, 378, 629, 271, 18099, 278, 3815, 373, 278, 6826, 29901, 13, 13, 4706, 6324, 14332, 29892, 13, 13, 4706, 306, 925, 29871};
test_prompt_encoding(&tokenizer, prompt3, expected_tokens3, sizeof(expected_tokens3) / sizeof(int));
// test 4
char* prompt4 = "Translate English to French:\n\n sea otter => loutre de mer\n peppermint => menthe poivrée\n plush girafe => girafe peluche\n cheese =>";
int expected_tokens4[] = {1, 4103, 9632, 4223, 304, 5176, 29901, 13, 13, 4706, 7205, 4932, 357, 1149, 301, 449, 276, 316, 2778, 13, 4706, 1236, 407, 837, 524, 1149, 6042, 354, 772, 440, 29878, 1318, 13, 4706, 715, 1878, 330, 3055, 1725, 1149, 330, 3055, 1725, 4639, 28754, 13, 4706, 923, 968, 1149};
test_prompt_encoding(&tokenizer, prompt4, expected_tokens4, sizeof(expected_tokens4) / sizeof(int));
// memory and file handles cleanup
free_tokenizer(&tokenizer);
}
// Entry point of the test binary: runs the tokenizer encoding checks and
// prints "ALL OK" once they have returned.
int main(int argc, char *argv[]) {
    // command-line arguments are accepted but not consulted
    (void)argc;
    (void)argv;

    test_prompt_encodings();
    printf("ALL OK\n");
    return 0;
}
+28 -64
View File
@@ -4,71 +4,37 @@ $ pytest
"""
import os
import pytest # pip install pytest
import requests
import subprocess
import torch
from model import ModelArgs, Transformer
from tokenizer import Tokenizer
# -----------------------------------------------------------------------------
# test utilities
def test_argmax_inference():
"""
Only the simplest test for now: run inference with temperature 0
(for determinism) in both C and PyTorch, and see that the sampled tokens
are the same.
"""
test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?
test_ckpt_dir = "test"
# run C version
model_path = os.path.join(test_ckpt_dir, "model.bin")
command = ["./run", model_path, "0.0"]
proc = subprocess.Popen(command, stdout=subprocess.PIPE)
c_tokens = []
for line in proc.stdout:
token = int(line.decode('utf-8').strip())
c_tokens.append(token)
proc.wait()
#print(c_tokens)
def download_file(url, filename):
    """Stream the body of `url` into the local file `filename`.

    Announces the transfer on stdout first; an HTTP error status is turned
    into an exception by raise_for_status() before anything is written.
    """
    print(f"Downloading {url} to {filename}")
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an HTTPError on bad status code
    # lazily yields the body in 8 KiB pieces
    pieces = response.iter_content(chunk_size=8192)
    with open(filename, 'wb') as out:
        out.writelines(pieces)
def attempt_download_files():
    """Make sure the stories260K artifacts exist in test_ckpt_dir, downloading any that are missing."""
    os.makedirs(test_ckpt_dir, exist_ok=True)
    root_url = "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K"
    for name in ("stories260K.bin", "stories260K.pt", "tok512.bin", "tok512.model"):
        local_path = os.path.join(test_ckpt_dir, name)
        if os.path.exists(local_path):
            continue  # already fetched on an earlier run
        # join with a literal '/': os.path.join inserts \\ on windows
        download_file(root_url + '/' + name, local_path)
expected_stdout = b'Once upon a time, there was a little girl named Lily. She loved to play outside in the park. One day, she saw a big, red ball. She wanted to play with it, but it was too high.\nLily\'s mom said, "Lily, let\'s go to the park." Lily was sad and didn\'t know what to do. She said, "I want to play with your ball, but I can\'t find it."\nLily was sad and didn\'t know what to do. She said, "I\'m sorry, Lily. I didn\'t know what to do."\nLily didn\'t want to help her mom, so she'
# -----------------------------------------------------------------------------
# actual tests
def test_runc():
    """ Forwards a model against a known-good desired outcome in run.c for 200 steps"""
    attempt_download_files()

    command = [
        "./run", os.path.join(test_ckpt_dir, "stories260K.bin"),
        "-z", os.path.join(test_ckpt_dir, "tok512.bin"),
        "-t", "0.0",
        "-n", "200",
    ]
    # stdout/stderr go to files rather than a pipe: pipe in windows terminal
    # does funny things like replacing \n with \r\n
    with open('err.txt', mode='wb') as fe, open('stdout.txt', mode='wb') as fo:
        subprocess.Popen(command, stdout=fo, stderr=fe).wait()

    with open('stdout.txt', mode='r') as f:
        captured = f.read()
    # drop the very last \n that run.c appends for aesthetic reasons, then compare as bytes
    assert captured[:-1].encode('ascii') == expected_stdout
def test_python():
""" Forwards a model against a known-good desired outcome in sample.py for 200 steps"""
attempt_download_files()
device = "cpu" # stories260K is small enough to just breeze through it on CPU
checkpoint = os.path.join(test_ckpt_dir, "stories260K.pt")
checkpoint_dict = torch.load(checkpoint, map_location=device)
gptconf = ModelArgs(**checkpoint_dict['model_args'])
# run PyTorch version
device = "cuda" if torch.cuda.is_available() else "cpu"
ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
checkpoint = torch.load(ckpt_path, map_location=device)
gptconf = ModelArgs(**checkpoint['model_args'])
model = Transformer(gptconf)
state_dict = checkpoint_dict['model']
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
@@ -78,12 +44,10 @@ def test_python():
model.to(device)
x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
with torch.inference_mode():
y = model.generate(x, max_new_tokens=200, temperature=0.0)
y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
pt_tokens = y[0].tolist()
pt_tokens = pt_tokens[1:] # remove BOS
#print(pt_tokens)
tokenizer_model = os.path.join(test_ckpt_dir, "tok512.model")
enc = Tokenizer(tokenizer_model=tokenizer_model)
text = enc.decode(pt_tokens)
text = text.encode('ascii') # turn into bytes
assert text == expected_stdout
# compare
assert c_tokens == pt_tokens
+140
View File
@@ -0,0 +1,140 @@
"""
Download, preprocess and serve the TinyShakespeare dataset as a DataLoader.
Follows the same interface as the TinyStories dataset.
"""
import argparse
import os
import random
import numpy as np
import requests
import torch
import torch.distributed as dist
from tqdm import tqdm
from tokenizer import Tokenizer
DATA_CACHE_DIR = "data"
def download_file(url: str, fname: str, chunk_size=1024):
    """Helper function to download a file from a given url.

    Streams the response body to disk in pieces while updating a tqdm
    progress bar labelled with `fname`.

    Args:
        url: address fetched with an HTTP GET.
        fname: destination path of the finished download.
        chunk_size: number of bytes requested per streamed piece.

    Raises:
        requests.HTTPError: if the server replies with an error status.
    """
    # Write to a sibling ".part" file and rename only after the last byte, so an
    # interrupted transfer never leaves a truncated file at `fname` (callers skip
    # the download when `fname` already exists).
    partial = fname + ".part"
    with requests.get(url, stream=True) as resp:
        # fail loudly instead of saving an error page as if it were the data
        resp.raise_for_status()
        # a missing content-length header yields total=0
        total = int(resp.headers.get("content-length", 0))
        with open(partial, "wb") as file, tqdm(
            desc=fname,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=chunk_size):
                size = file.write(data)
                bar.update(size)
    os.replace(partial, fname)
def download():
    """Fetch the TinyShakespeare text into DATA_CACHE_DIR unless it is already there."""
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)

    data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    data_filename = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")

    # an existing file is taken as a finished download and left untouched
    already_present = os.path.exists(data_filename)
    if already_present:
        print(f"{data_filename} already exists, skipping download...")
    else:
        print(f"Downloading {data_url} to {data_filename}...")
        download_file(data_url, data_filename)
    print("Download done.")
def pretokenize():
    """Tokenize tinyshakespeare.txt line by line and store the ids as a uint16 .bin file next to it."""
    enc = Tokenizer()
    data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.txt")
    bin_file = data_file.replace(".txt", ".bin")

    all_tokens = []
    with open(data_file, "r") as f:
        for line in f:
            # each stripped line is encoded on its own, with BOS and without EOS
            all_tokens += enc.encode(line.strip(), bos=True, eos=False)
    all_tokens = np.array(all_tokens, dtype=np.uint16)
    print(f"Total tokens: {len(all_tokens)}")

    with open(bin_file, "wb") as f:
        f.write(all_tokens.tobytes())
    print(f"Saved {bin_file}")
    print("Done.")
class PretokDataset(torch.utils.data.IterableDataset):
    """Endless stream of (input, target) token windows read from the pretokenized .bin file."""

    def __init__(self, split, max_seq_len):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.split = split

    def __iter__(self):
        # worker id inside a DataLoader (0 when there is no worker info)
        worker_info = torch.utils.data.get_worker_info()
        if worker_info:
            worker_id = worker_info.id
        else:
            worker_id = 0
        # DDP rank (0 when torch.distributed is not initialized)
        rank = 0
        if dist.is_initialized():
            rank = dist.get_rank()
        # fold worker id and rank into one seed so each stream shuffles differently
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")

        data_file = os.path.join(DATA_CACHE_DIR, "tinyshakespeare.bin")
        m_all = np.memmap(data_file, dtype=np.uint16, mode="r")
        # first 90% of the tokens is "train"; any other split name gets the last 10%
        split_ix = int(len(m_all) * 0.9)
        m = m_all[:split_ix] if self.split == "train" else m_all[split_ix:]

        # one window fewer than fits, so the +1 target token below stays in range
        num_batches = len(m) // self.max_seq_len - 1
        assert num_batches > 0, "this split is way too small? investigate."

        while True:
            # a fresh ordering of the window indices on every pass
            order = list(range(num_batches))
            rng.shuffle(order)
            for ix in order:
                start = ix * self.max_seq_len
                window = m[start : start + self.max_seq_len + 1]
                # .astype copies the window out of the memmap into a new array in RAM
                chunk = torch.from_numpy(window.astype(np.int64))
                # targets are the inputs shifted left by one token
                yield chunk[:-1], chunk[1:]
class ShakespeareTask:
    """Batch provider for the TinyShakespeare data, exposing a static iter_batches generator."""

    @staticmethod
    def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
        """Yield (x, y) batches drawn from PretokDataset, each moved to `device`."""
        loader = torch.utils.data.DataLoader(
            PretokDataset(split, max_seq_len),
            batch_size=batch_size,
            pin_memory=True,
            num_workers=num_workers,
        )
        for inputs, targets in loader:
            # non_blocking=True is passed to both transfers (the loader pins memory)
            yield inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
if __name__ == "__main__":
    # dispatch table: stage name -> function implementing it
    fun = {
        "download": download,
        "pretokenize": pretokenize,
    }
    parser = argparse.ArgumentParser()
    # The accepted choices are taken from the dispatch table, so argparse rejects
    # any stage without a handler with a usage error. A hard-coded list that also
    # offered "train_tokenizer" would pass parsing and then die with a KeyError.
    parser.add_argument("stage", type=str, choices=list(fun))
    args = parser.parse_args()
    # depending on the stage call the appropriate function
    fun[args.stage]()
+20 -133
View File
@@ -9,11 +9,9 @@ import os
import random
from typing import List
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import requests
import sentencepiece as spm
import torch
import torch.distributed as dist
from tqdm import tqdm
@@ -39,7 +37,7 @@ def download_file(url: str, fname: str, chunk_size=1024):
def download():
"""Downloads the TinyStories dataset to DATA_CACHE_DIR"""
"""Downloads the dataset to disk."""
os.makedirs(DATA_CACHE_DIR, exist_ok=True)
# download the TinyStories dataset, unless it's already downloaded
@@ -68,66 +66,10 @@ def download():
print(f"Number of shards: {len(shard_filenames)}")
print(f"Example story:\n{data[0]}")
def train_vocab(vocab_size):
    """
    Train a custom sentencepiece BPE tokenizer on part of the TinyStories data.

    The first shards are dumped into a temporary text file, sentencepiece is
    trained on it with DATA_CACHE_DIR/tok{N} as the output prefix (N being the
    vocab size), and the user is then asked whether to delete the temporary file.
    """
    assert vocab_size > 0, "Vocab size must be positive"

    # sentencepiece output prefix; the trained model ends up at {prefix}.model
    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")

    # only a handful of shards are used for vocab training, kept low for efficiency
    num_shards = 10

    # 1) export a large chunk of text as a single text file tiny.txt
    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))

    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
    with open(tiny_file, "w") as of:
        for shard in tqdm(shard_filenames[:num_shards]):
            with open(shard, "r") as f:
                data = json.load(f)
            # one stripped story per line
            of.writelines(example["story"].strip() + "\n" for example in data)
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

    # 2) train the sentencepiece model
    print("Will now train the vocab...")
    trainer_options = dict(
        input=tiny_file,
        model_prefix=prefix,
        model_type="bpe",
        vocab_size=vocab_size,
        self_test_sample_size=0,
        input_format="text",
        character_coverage=1.0,
        num_threads=os.cpu_count(),
        split_digits=True,
        allow_whitespace_only_pieces=True,
        byte_fallback=True,
        unk_surface=r" \342\201\207 ",
        normalization_rule_name="identity",
    )
    spm.SentencePieceTrainer.train(**trainer_options)

    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
    wants_delete = dec.lower() == "y"
    if wants_delete:
        os.remove(tiny_file)
        print(f"Deleted {tiny_file}")

    print(f"Trained tokenizer is in {prefix}.model")
    print("Done.")
def process_shard(args, vocab_size):
def process_shard(args):
shard_id, shard = args
tokenizer_model = get_tokenizer_model_path(vocab_size)
enc = Tokenizer(tokenizer_model)
enc = Tokenizer()
with open(shard, "r") as f:
data = json.load(f)
all_tokens = []
@@ -138,49 +80,31 @@ def process_shard(args, vocab_size):
all_tokens.extend(tokens)
# convert to uint16 nparray
all_tokens = np.array(all_tokens, dtype=np.uint16)
# calculate the output filename
if vocab_size == 0:
# if we're using Llama 2, just save the tokenized file in the same dir
tokenized_filename = shard.replace(".json", ".bin")
else:
# save .bin files into a new tok{N} directory
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
shard_basename = os.path.basename(shard)
bin_basename = shard_basename.replace(".json", ".bin")
tokenized_filename = os.path.join(bin_dir, bin_basename)
# write the bytes
# write to disk
tokenized_filename = shard.replace(".json", ".bin")
with open(tokenized_filename, "wb") as f:
f.write(all_tokens.tobytes())
# calculate the average sequence length (they are separated by BOS=1)
avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
print(f"Saved {tokenized_filename}")
def pretokenize(vocab_size):
def pretokenize():
# iterate the shards and tokenize all of them one by one
data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
if vocab_size > 0:
# .bin files will be saved into tok{N} directory, create it once here
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
os.makedirs(bin_dir, exist_ok=True)
# process all the shards in a process pool
fun = partial(process_shard, vocab_size=vocab_size)
with ProcessPoolExecutor() as executor:
executor.map(fun, enumerate(shard_filenames))
executor.map(process_shard, enumerate(shard_filenames))
print("Done.")
class PretokDataset(torch.utils.data.IterableDataset):
"""Loads pretokenized examples from disk and yields them as PyTorch tensors."""
def __init__(self, split, max_seq_len, vocab_size, vocab_source):
def __init__(self, split, max_seq_len):
super().__init__()
self.split = split
self.max_seq_len = max_seq_len
self.vocab_size = vocab_size
self.vocab_source = vocab_source
def __iter__(self):
# get worker info within a DataLoader
@@ -192,17 +116,10 @@ class PretokDataset(torch.utils.data.IterableDataset):
seed = 42 + worker_id + 1337 * rank
rng = random.Random(seed)
print(f"Created a PretokDataset with rng seed {seed}")
if self.vocab_source == "llama2":
# the .bin files are right along the .json files
bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
elif self.vocab_source == "custom":
# the .bin files are in tok{N} directory
bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}")
shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.bin")))
# train/test split. let's use only shard 0 for test split, rest train
shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
assert len(shard_filenames)>0, f"No bin files found in {bin_dir}"
while True:
rng.shuffle(shard_filenames)
for shard in shard_filenames:
@@ -222,25 +139,12 @@ class PretokDataset(torch.utils.data.IterableDataset):
y = chunk[1:]
yield x, y
# -----------------------------------------------------------------------------
# public interface functions
def get_tokenizer_model_path(vocab_size):
    """
    Map a vocab size to the path of its sentencepiece tokenizer model.
    vocab_size = 0 stands for the default Llama 2 tokenizer and yields None;
    any other value yields DATA_CACHE_DIR/tok{vocab_size}.model.
    """
    if vocab_size != 0:
        return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
    return None
class Task:
@staticmethod
def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
ds = PretokDataset(**dataset_kwargs)
def iter_batches(split, batch_size, max_seq_len, device, num_workers=0):
ds = PretokDataset(split, max_seq_len)
dl = torch.utils.data.DataLoader(
ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
)
@@ -249,33 +153,16 @@ class Task:
y = y.to(device, non_blocking=True)
yield x, y
# -----------------------------------------------------------------------------
# CLI for constructing the dataset
if __name__ == "__main__":
"""
These stages are designed to be run in order.
To tokenize data with the Llama 2 tokenizer:
python tinystories.py download
python tinystories.py pretokenize
To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
python tinystories.py download
python tinystories.py train_vocab --vocab_size=2048
python tinystories.py pretokenize --vocab_size=2048
"""
parser = argparse.ArgumentParser()
parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
parser.add_argument("stage", type=str, choices=["download", "train_tokenizer", "pretokenize"])
args = parser.parse_args()
# depending on the stage call the appropriate function
if args.stage == "download":
download()
elif args.stage == "train_vocab":
train_vocab(vocab_size=args.vocab_size)
elif args.stage == "pretokenize":
pretokenize(vocab_size=args.vocab_size)
else:
raise ValueError(f"Unknown stage {args.stage}")
fun = {
"download": download,
"pretokenize": pretokenize,
}
fun[args.stage]()
BIN
View File
Binary file not shown.
+10 -13
View File
@@ -4,19 +4,20 @@
import os
import struct
import argparse
from logging import getLogger
from typing import List
from sentencepiece import SentencePieceProcessor
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
class Tokenizer:
def __init__(self, tokenizer_model=None):
model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
def __init__(self):
model_path = TOKENIZER_MODEL
assert os.path.isfile(model_path), model_path
self.sp_model = SentencePieceProcessor(model_file=model_path)
self.model_path = model_path
#print(f"Loaded SentencePiece model from {model_path}")
# BOS / EOS token IDs
self.n_words: int = self.sp_model.vocab_size()
@@ -51,28 +52,24 @@ class Tokenizer:
t = '\n<s>\n'
elif i == self.eos_id:
t = '\n</s>\n'
elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
t = t.replace('▁', ' ') # sentencepiece uses this character (U+2581) as whitespace
b = t.encode('utf-8') # bytes of this token, utf-8 encoded
tokens.append(b)
scores.append(s)
# record the max token length
max_token_length = max(len(t) for t in tokens)
# write to a binary file
# the tokenizer.bin file is the same as .model file, but .bin
tokenizer_bin = self.model_path.replace('.model', '.bin')
with open(tokenizer_bin, 'wb') as f:
with open(TOKENIZER_BIN, 'wb') as f:
f.write(struct.pack("I", max_token_length))
for bytes, score in zip(tokens, scores):
f.write(struct.pack("fI", score, len(bytes)))
f.write(bytes)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ")
args = parser.parse_args()
t = Tokenizer(args.tokenizer_model)
t = Tokenizer()
t.export()
+10 -17
View File
@@ -29,7 +29,7 @@ from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from tinystories import Task
from export import model_export
from tinyshakespeare import ShakespeareTask
# -----------------------------------------------------------------------------
# I/O
@@ -47,13 +47,11 @@ wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# data
batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size
max_seq_len = 256
vocab_source = "llama2" # llama2|custom; use Llama 2 vocab from Meta, or custom trained
vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
dataset = "tinystories" # tinystories|tinyshakespeare
# model
dim = 288
n_layers = 6
n_heads = 6
n_kv_heads = 6
multiple_of = 32
dropout = 0.0
# adamw optimizer
@@ -85,10 +83,6 @@ config = {k: globals()[k] for k in config_keys} # will be useful for logging
lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla
min_lr = 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# validating checks
assert vocab_source in ["llama2", "custom"]
assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
# various inits, derived attributes, I/O setup
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
if ddp:
@@ -129,12 +123,11 @@ ctx = (
)
# task-specific setup
task = {'tinystories': Task, 'tinyshakespeare': ShakespeareTask}[dataset]
iter_batches = partial(
Task.iter_batches,
task.iter_batches,
batch_size=batch_size,
max_seq_len=max_seq_len,
vocab_size=vocab_size,
vocab_source=vocab_source,
device=device,
num_workers=0,
)
@@ -148,8 +141,8 @@ model_args = dict(
dim=dim,
n_layers=n_layers,
n_heads=n_heads,
n_kv_heads=n_kv_heads,
vocab_size=vocab_size,
n_kv_heads=n_heads,
vocab_size=32000,
multiple_of=multiple_of,
max_seq_len=max_seq_len,
dropout=dropout,
@@ -213,7 +206,7 @@ def estimate_loss():
out = {}
model.eval()
for split in ["train", "val"]:
batch_iter = iter_batches(split=split)
batch_iter = iter_batches(split)
losses = torch.zeros(eval_iters) # keep on CPU
for k in range(eval_iters):
X, Y = next(batch_iter)
@@ -245,7 +238,7 @@ if wandb_log and master_process:
wandb.init(project=wandb_project, name=wandb_run_name, config=config)
# training loop
train_batch_iter = iter_batches(split="train")
train_batch_iter = iter_batches("train")
X, Y = next(train_batch_iter) # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
@@ -271,7 +264,7 @@ while True:
"loss/val": losses["val"],
"lr": lr,
"mfu": running_mfu * 100, # convert to percentage
}, step = iter_num
}
)
except Exception as e:
print(f"logging to wandb failed: {e}")
@@ -288,7 +281,7 @@ while True:
}
print(f"saving checkpoint to {out_dir}")
torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
model_export(raw_model, os.path.join(out_dir, "model.bin"), version=0)
raw_model.export(os.path.join(out_dir, "model.bin"))
if iter_num == 0 and eval_only:
break