From 9414e7a45eb1cd492b1f158598dc06e60146fdba Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sun, 23 Jul 2023 14:52:08 +0000
Subject: [PATCH] tweaks and add a simple test

---
 README.md   | 10 +++++++---
 model.py    | 22 +++++++++++-----------
 run.c       | 30 ------------------------------
 run_wrap.py |  2 +-
 sample.py   |  2 --
 test_all.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 train.py    |  2 +-
 7 files changed, 73 insertions(+), 48 deletions(-)
 create mode 100644 test_all.py

diff --git a/README.md b/README.md
index 866dfaf..bdfc849 100644
--- a/README.md
+++ b/README.md
@@ -54,13 +54,17 @@ python run_wrap.py
 
 I hope to delete this script soon though. Anyway, watch the tokens stream by, fun!
 
-To verify correctness, we can also run the PyTorch inference script:
+We can also run the PyTorch inference script for comparison:
 
 ```bash
 python sample.py
 ```
 
-Which gives the same results. I'd love to find some time to create actual tests, one day maybe. For now I just manually inspected activations and verified that they match, and that the samples are identical at temperature 0. If someone wishes to help me with tests I welcome PRs.
+Which gives the same results. More detailed testing will be done in `test_all.py`, run as:
+
+```bash
+$ pytest
+```
 
 ## unsorted todos
 
@@ -70,7 +74,7 @@ Which gives the same results. I'd love to find some time to create actual tests,
 - todo support inferencing beyond max_seq_len steps, have to think through the kv cache
 - why is MFU so low (~20%) on my A100 40GB for training?
 - weird errors with torch.compile and wandb when using DDP
-- make tests to decrease yolo
+- make more and better tests to decrease yolo
 
 ## License
 MIT
diff --git a/model.py b/model.py
index 8a310a8..d04fc76 100644
--- a/model.py
+++ b/model.py
@@ -288,19 +288,19 @@ class Transformer(nn.Module):
             idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
             # forward the model to get the logits for the index in the sequence
             logits, _ = self(idx_cond)
-            # pluck the logits at the final step and scale by desired temperature
-            logits = logits[:, -1, :] / temperature
-            # optionally crop the logits to only the top k options
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float('Inf')
-            # apply softmax to convert logits to (normalized) probabilities
-            probs = F.softmax(logits, dim=-1)
+            logits = logits[:, -1, :] # crop to just the final time step
             if temperature == 0.0:
-                # sample the most likely index
-                _, idx_next = torch.topk(probs, k=1, dim=-1)
+                # "sample" the single most likely index
+                _, idx_next = torch.topk(logits, k=1, dim=-1)
             else:
-                # sample from the distribution
+                # scale the (already cropped) logits by the desired temperature
+                logits = logits / temperature
+                # optionally crop the logits to only the top k options
+                if top_k is not None:
+                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = -float('Inf')
+                # apply softmax to convert logits to (normalized) probabilities
+                probs = F.softmax(logits, dim=-1)
                 idx_next = torch.multinomial(probs, num_samples=1)
             # append sampled index to the running sequence and continue
             idx = torch.cat((idx, idx_next), dim=1)
diff --git a/run.c b/run.c
index a0d1ea8..a94f9b3 100644
--- a/run.c
+++ b/run.c
@@ -148,36 +148,6 @@ void free_weights(TransformerWeights* w, Config* p) {
 // ----------------------------------------------------------------------------
 // initialization: random init, or read from checkpoint
 
-// initializes weights to random numbers from -.5 to .5
-void init_rand(float* w, int size) {
-    for (int i = 0; i < size; i++) {
-        w[i] = ((float)rand()/(float)(RAND_MAX)) - 0.5f;
-    }
-}
-
-// constant init
-void init_const(float* w, int size, float val) {
-    for (int i = 0; i < size; i++) {
-        w[i] = val;
-    }
-}
-
-void random_init_weights(TransformerWeights* w, Config* p) {
-    init_rand(w->token_embedding_table, p->vocab_size * p->dim);
-    init_const(w->rms_att_weight, p->n_layers * p->dim, 1.0f);
-    init_const(w->rms_ffn_weight, p->n_layers * p->dim, 1.0f);
-    init_rand(w->wq, p->n_layers * p->dim * p->dim);
-    init_rand(w->wk, p->n_layers * p->dim * p->dim);
-    init_rand(w->wv, p->n_layers * p->dim * p->dim);
-    init_rand(w->wo, p->n_layers * p->dim * p->dim);
-    init_rand(w->w1, p->n_layers * p->dim * p->hidden_dim);
-    init_rand(w->w2, p->n_layers * p->hidden_dim * p->dim);
-    init_rand(w->w3, p->n_layers * p->dim * p->hidden_dim);
-    init_const(w->rms_final_weight, p->dim, 1.0f);
-    init_rand(w->freq_cis_real, p->seq_len * p->dim / 2);
-    init_rand(w->freq_cis_imag, p->seq_len * p->dim / 2);
-}
-
 void checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f);
     fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f);
diff --git a/run_wrap.py b/run_wrap.py
index e97bbd5..bd7dc3f 100644
--- a/run_wrap.py
+++ b/run_wrap.py
@@ -9,7 +9,7 @@ import subprocess
 import time
 
 # specify your command
-command = ["./run", "model.bin", "0.0"]
+command = ["./run", "model.bin"]
 
 # Start the process
 proc = subprocess.Popen(command, stdout=subprocess.PIPE)
diff --git a/sample.py b/sample.py
index 138c187..2038a63 100644
--- a/sample.py
+++ b/sample.py
@@ -44,8 +44,6 @@ for k,v in list(state_dict.items()):
         state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
 model.load_state_dict(state_dict, strict=False)
 
-model.export() # model.bin
-
 model.eval()
 model.to(device)
 if compile:
diff --git a/test_all.py b/test_all.py
new file mode 100644
index 0000000..8563614
--- /dev/null
+++ b/test_all.py
@@ -0,0 +1,53 @@
+"""
+Run simply with
+$ pytest
+"""
+import os
+import pytest # pip install pytest
+import subprocess
+
+import torch
+from model import ModelArgs, Transformer
+
+def test_argmax_inference():
+    """
+    Check that the C and PyTorch implementations agree under greedy decoding.
+
+    Runs inference with temperature 0 (for determinism) in both C (`./run`)
+    and PyTorch, and asserts that the sampled tokens are the same. Expects
+    `model.bin` and `ckpt.pt` in `out/`, and the `./run` binary in the cwd.
+    """
+    test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?
+
+    # run C version; check=True fails loudly if the binary exits non-zero,
+    # and run() waits for the process and closes its stdout pipe for us
+    model_path = os.path.join(test_ckpt_dir, "model.bin")
+    command = ["./run", model_path, "0.0"]
+    result = subprocess.run(command, stdout=subprocess.PIPE, check=True)
+    # every stdout line is parsed as one integer token id
+    c_tokens = [int(line) for line in result.stdout.decode('utf-8').splitlines()]
+
+    # run PyTorch version
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    gptconf = ModelArgs(**checkpoint['model_args'])
+    model = Transformer(gptconf)
+    state_dict = checkpoint['model']
+    # drop the '_orig_mod.' key prefix (presumably left by torch.compile)
+    unwanted_prefix = '_orig_mod.'
+    for k in list(state_dict):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict, strict=False)
+    model.eval()
+    model.to(device)
+    # greedy-decode max_seq_len new tokens starting from BOS alone
+    x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
+    with torch.inference_mode():
+        y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
+    pt_tokens = y[0].tolist()
+    pt_tokens = pt_tokens[1:] # remove BOS
+
+    # compare
+    assert c_tokens == pt_tokens
diff --git a/train.py b/train.py
index 9d9a098..7aa46c4 100644
--- a/train.py
+++ b/train.py
@@ -55,7 +55,7 @@ dropout = 0.0
 # adamw optimizer
 gradient_accumulation_steps = 4  # used to simulate larger batch sizes
 learning_rate = 5e-4  # max learning rate
-max_iters = 300000  # total number of training iterations
+max_iters = 100000  # total number of training iterations
 weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95