From fe9b9f2f15eda96507837f1b2584e98401a61930 Mon Sep 17 00:00:00 2001 From: Jani Monoses Date: Wed, 23 Aug 2023 17:28:14 +0300 Subject: [PATCH 1/6] Train vocab in Python --- README.md | 2 +- tinystories.py | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e9df1f6..f4e20e9 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ python tinystories.py train_vocab --vocab_size=4096 python tinystories.py pretokenize --vocab_size=4096 ``` -The `train_vocab` stage will call the `train_vocab.sh` script, which calls the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size. +The `train_vocab` stage will call the `sentencepiece` library to train the tokenizer, storing it in a new file `data/tok4096.model`. I tried to reproduce as well as I could the settings that (I think) Meta used to train their vocabulary. This uses the Byte Pair Encoding algorithm that starts out with raw utf8 byte sequences of the text data and then iteratively merges the most common consecutive pairs of tokens to form the vocabulary. Inspect the `tinystories.py` file - the custom tokenizers are stored in a special directory structure indexed by the vocab size. A quick note of interest is that vocab size of 4096 trained specifically on tinystories creates integer sequences with about the same sequence length per example as the default Llama 2 tokenizer of 32000 tokens! 
This means that our custom, tailored tokenizer is a lot better adapted to our specific text, and can compress it very effectively. So our trained models are smaller and faster. diff --git a/tinystories.py b/tinystories.py index 90d576b..003b1e3 100644 --- a/tinystories.py +++ b/tinystories.py @@ -13,6 +13,7 @@ from functools import partial import numpy as np import requests +import sentencepiece as spm import torch import torch.distributed as dist from tqdm import tqdm @@ -97,16 +98,21 @@ def train_vocab(vocab_size): of.write(text + "\n") print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB") - # 2) run the train_vocab.sh script that trains the sentencepiece model - print("Will now train the vocab with:") - cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}" - print(cmd) - print("OK? [y/N] ") - dec = input() - if dec.lower() != "y": - print("Exiting...") - return - os.system(cmd) + # 2) train the sentencepiece model + print("Will now train the vocab...") + + spm.SentencePieceTrainer.train(input=tiny_file, + model_prefix=prefix, + model_type="bpe", + vocab_size=vocab_size, + self_test_sample_size=0, + input_format="text", + character_coverage=1.0, + split_digits=True, + allow_whitespace_only_pieces=True, + byte_fallback=True, + unk_surface=r" \342\201\207 ", + normalization_rule_name="identity") # 3) optional cleanup, ask the user if they'd like to delete tiny.txt dec = input(f"Delete the temporary file {tiny_file}? 
[y/N] ") From 9bc72acab041f471b655a763206a24513e8313d9 Mon Sep 17 00:00:00 2001 From: Ali Nehzat Date: Thu, 24 Aug 2023 09:09:16 +1000 Subject: [PATCH 2/6] steps shouldn't exceed the model's seq_len either --- run.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.c b/run.c index da1fbc4..0eaa655 100644 --- a/run.c +++ b/run.c @@ -839,7 +839,7 @@ int main(int argc, char *argv[]) { // build the Transformer via the model .bin file Transformer transformer; build_transformer(&transformer, checkpoint_path); - if (steps == 0) steps = transformer.config.seq_len; // ovrerride to ~max length + if (steps == 0 || steps > transformer.config.seq_len) steps = transformer.config.seq_len; // override to ~max length // build the Tokenizer via the tokenizer .bin file Tokenizer tokenizer; From 096325b66c2ab84095bd407cbab84d731edc65bc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 24 Aug 2023 03:09:55 +0000 Subject: [PATCH 3/6] bring back num_threads --- tinystories.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tinystories.py b/tinystories.py index 003b1e3..800d73a 100644 --- a/tinystories.py +++ b/tinystories.py @@ -100,7 +100,6 @@ def train_vocab(vocab_size): # 2) train the sentencepiece model print("Will now train the vocab...") - spm.SentencePieceTrainer.train(input=tiny_file, model_prefix=prefix, model_type="bpe", @@ -108,6 +107,7 @@ def train_vocab(vocab_size): self_test_sample_size=0, input_format="text", character_coverage=1.0, + num_threads=os.cpu_count(), split_digits=True, allow_whitespace_only_pieces=True, byte_fallback=True, From d7cd98633dcc50c9e58f4b39b105fe9f9494cf85 Mon Sep 17 00:00:00 2001 From: Andrej Date: Thu, 24 Aug 2023 09:04:52 -0700 Subject: [PATCH 4/6] add todo item to add a PyTorch Engine --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f4e20e9..1652b4c 100644 --- a/README.md +++ b/README.md @@ -327,9 +327,10 @@ If your candidate PRs have 
elements of these it doesn't mean they won't get merg - support Llama 2 7B Chat models with a Chat UI/UX in run.c, very similar to llama.cpp - ability to calculate perplexity in run.c, exactly as done in llama.cpp - add support in run.c of reading version 1+ files from export, later deprecate "version 0" -- add more tests inside [test.c](test.c) (call for help!) +- add more tests in [test.c](test.c) - runq.c (int8 quantization) add - run.cu (CUDA) investigate and merge +- add an Engine class that serves the model ~efficiently but in PyTorch (see [Issue 346](https://github.com/karpathy/llama2.c/issues/346)) - make it easier to add a new dataset with not too much pain - (LoRA) finetuning and export of Llama 2 models From 19cfbeca71f8d65d3b19a38e4dec9f4e37a731cf Mon Sep 17 00:00:00 2001 From: Diego Marcos Segura Date: Thu, 24 Aug 2023 19:45:23 -0700 Subject: [PATCH 5/6] Fix typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1652b4c..d08f611 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Train the Llama 2 LLM architecture in PyTorch then inference it with one simple As the architecture is identical, you can also load and inference Meta's Llama 2 models. However, the current code only inferences models in fp32, so you will most likely not be able to productively load models larger than 7B. Work on model quantization is currently ongoing. -Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. 
Compred to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies. +Please note that this repo started recently as a fun weekend project: I took my earlier [nanoGPT](https://github.com/karpathy/nanoGPT), tuned it to implement the Llama-2 architecture instead of GPT-2, and the meat of it was writing the C inference engine in [run.c](run.c). So the project is young and moving quickly. Hat tip to the awesome [llama.cpp](https://github.com/ggerganov/llama.cpp) for inspiring this project. Compared to llama.cpp, I wanted something super simple, minimal, and educational so I chose to hard-code the Llama 2 architecture and just roll one inference file of pure C with no dependencies. ## feel the magic From 6def77d4baef39b9878643001f473e49af405c7c Mon Sep 17 00:00:00 2001 From: Markus Zhang <45726499+photomz@users.noreply.github.com> Date: Fri, 25 Aug 2023 17:12:29 +0800 Subject: [PATCH 6/6] Correct WandB log step --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index e958538..e8321d8 100644 --- a/train.py +++ b/train.py @@ -271,7 +271,7 @@ while True: "loss/val": losses["val"], "lr": lr, "mfu": running_mfu * 100, # convert to percentage - } + }, step = iter_num ) except Exception as e: print(f"logging to wandb failed: {e}")