From 9414e7a45eb1cd492b1f158598dc06e60146fdba Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 23 Jul 2023 14:52:08 +0000 Subject: [PATCH] tweaks and add a simple test --- README.md | 10 +++++++--- model.py | 22 +++++++++++----------- run.c | 30 ------------------------------ run_wrap.py | 2 +- sample.py | 2 -- test_all.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ train.py | 2 +- 7 files changed, 73 insertions(+), 48 deletions(-) create mode 100644 test_all.py diff --git a/README.md b/README.md index 866dfaf..bdfc849 100644 --- a/README.md +++ b/README.md @@ -54,13 +54,17 @@ python run_wrap.py I hope to delete this script soon though. Anyway, watch the tokens stream by, fun! -To verify correctness, we can also run the PyTorch inference script: +We can also run the PyTorch inference script for comparison: ```bash python sample.py ``` -Which gives the same results. I'd love to find some time to create actual tests, one day maybe. For now I just manually inspected activations and verified that they match, and that the samples are identical at temperature 0. If someone wishes to help me with tests I welcome PRs. +Which gives the same results. More detailed testing will be done in `test_all.py`, run as: + +```bash +$ pytest +``` ## unsorted todos @@ -70,7 +74,7 @@ Which gives the same results. I'd love to find some time to create actual tests, - todo support inferencing beyond max_seq_len steps, have to think through the kv cache - why is MFU so low (~20%) on my A100 40GB for training? - weird errors with torch.compile and wandb when using DDP -- make tests to decrease yolo +- make more better tests to decrease yolo ## License MIT diff --git a/model.py b/model.py index 8a310a8..d04fc76 100644 --- a/model.py +++ b/model.py @@ -288,19 +288,19 @@ class Transformer(nn.Module): idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:] # forward the model to get the logits for the index in the sequence logits, _ = self(idx_cond) - # pluck the logits at the final step and scale by desired temperature - logits = logits[:, -1, :] / temperature - # optionally crop the logits to only the top k options - if top_k is not None: - v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - logits[logits < v[:, [-1]]] = -float('Inf') - # apply softmax to convert logits to (normalized) probabilities - probs = F.softmax(logits, dim=-1) + logits = logits[:, -1, :] # crop to just the final time step if temperature == 0.0: - # sample the most likely index - _, idx_next = torch.topk(probs, k=1, dim=-1) + # "sample" the single most likely index + _, idx_next = torch.topk(logits, k=1, dim=-1) else: - # sample from the distribution + # pluck the logits at the final step and scale by desired temperature + logits = logits / temperature + # optionally crop the logits to only the top k options + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + logits[logits < v[:, [-1]]] = -float('Inf') + # apply softmax to convert logits to (normalized) probabilities + probs = F.softmax(logits, dim=-1) idx_next = torch.multinomial(probs, num_samples=1) # append sampled index to the running sequence and continue idx = torch.cat((idx, idx_next), dim=1) diff --git a/run.c b/run.c index a0d1ea8..a94f9b3 100644 --- a/run.c +++ b/run.c @@ -148,36 +148,6 @@ void free_weights(TransformerWeights* w, Config* p) { // ---------------------------------------------------------------------------- // initialization: random init, or read from checkpoint -// initializes weights to random numbers from -.5 to .5 -void init_rand(float* w, int size) { - for (int i = 0; i < size; i++) { - w[i] = ((float)rand()/(float)(RAND_MAX)) - 0.5f; - } -} - -// constant init -void init_const(float* w, int size, float val) { - for (int i = 0; i < size; i++) { - w[i] = val; - } -} - -void random_init_weights(TransformerWeights* w, Config* p) { - init_rand(w->token_embedding_table, p->vocab_size * p->dim); - init_const(w->rms_att_weight, p->n_layers * p->dim, 1.0f); - init_const(w->rms_ffn_weight, p->n_layers * p->dim, 1.0f); - init_rand(w->wq, p->n_layers * p->dim * p->dim); - init_rand(w->wk, p->n_layers * p->dim * p->dim); - init_rand(w->wv, p->n_layers * p->dim * p->dim); - init_rand(w->wo, p->n_layers * p->dim * p->dim); - init_rand(w->w1, p->n_layers * p->dim * p->hidden_dim); - init_rand(w->w2, p->n_layers * p->hidden_dim * p->dim); - init_rand(w->w3, p->n_layers * p->dim * p->hidden_dim); - init_const(w->rms_final_weight, p->dim, 1.0f); - init_rand(w->freq_cis_real, p->seq_len * p->dim / 2); - init_rand(w->freq_cis_imag, p->seq_len * p->dim / 2); -} - void checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f); fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f); diff --git a/run_wrap.py b/run_wrap.py index e97bbd5..bd7dc3f 100644 --- a/run_wrap.py +++ b/run_wrap.py @@ -9,7 +9,7 @@ import subprocess import time # specify your command -command = ["./run", "model.bin", "0.0"] +command = ["./run", "model.bin"] # Start the process proc = subprocess.Popen(command, stdout=subprocess.PIPE) diff --git a/sample.py b/sample.py index 138c187..2038a63 100644 --- a/sample.py +++ b/sample.py @@ -44,8 +44,6 @@ for k,v in list(state_dict.items()): state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) model.load_state_dict(state_dict, strict=False) -model.export() # model.bin - model.eval() model.to(device) if compile: diff --git a/test_all.py b/test_all.py new file mode 100644 index 0000000..8563614 --- /dev/null +++ b/test_all.py @@ -0,0 +1,53 @@ +""" +Run simply with +$ pytest +""" +import os +import pytest # pip install pytest +import subprocess + +import torch +from model import ModelArgs, Transformer + +def test_argmax_inference(): + """ + Only the simplest test for now: run inference with temperature 0 + (for determinism) in both C and PyTorch, and see that the sampled tokens + are the same. + """ + test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this? + + # run C version + model_path = os.path.join(test_ckpt_dir, "model.bin") + command = ["./run", model_path, "0.0"] + proc = subprocess.Popen(command, stdout=subprocess.PIPE) + c_tokens = [] + for line in proc.stdout: + token = int(line.decode('utf-8').strip()) + c_tokens.append(token) + proc.wait() + #print(c_tokens) + + # run PyTorch version + device = "cuda" if torch.cuda.is_available() else "cpu" + ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt") + checkpoint = torch.load(ckpt_path, map_location=device) + gptconf = ModelArgs(**checkpoint['model_args']) + model = Transformer(gptconf) + state_dict = checkpoint['model'] + unwanted_prefix = '_orig_mod.' + for k,v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) + model.load_state_dict(state_dict, strict=False) + model.eval() + model.to(device) + x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS + with torch.inference_mode(): + y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0) + pt_tokens = y[0].tolist() + pt_tokens = pt_tokens[1:] # remove BOS + #print(pt_tokens) + + # compare + assert c_tokens == pt_tokens diff --git a/train.py b/train.py index 9d9a098..7aa46c4 100644 --- a/train.py +++ b/train.py @@ -55,7 +55,7 @@ dropout = 0.0 # adamw optimizer gradient_accumulation_steps = 4 # used to simulate larger batch sizes learning_rate = 5e-4 # max learning rate -max_iters = 300000 # total number of training iterations +max_iters = 100000 # total number of training iterations weight_decay = 1e-1 beta1 = 0.9 beta2 = 0.95