tweaks and add a simple test

2023-07-23 14:52:08 +00:00
parent f499d9d2b5
commit 9414e7a45e
7 changed files with 73 additions and 48 deletions
@@ -54,13 +54,17 @@ python run_wrap.py

 I hope to delete this script soon though. Anyway, watch the tokens stream by, fun!

-To verify correctness, we can also run the PyTorch inference script:
+We can also run the PyTorch inference script for comparison:

 ```bash
 python sample.py
 ```

-Which gives the same results. I'd love to find some time to create actual tests, one day maybe. For now I just manually inspected activations and verified that they match, and that the samples are identical at temperature 0. If someone wishes to help me with tests I welcome PRs.
+Which gives the same results. More detailed testing will be done in `test_all.py`, run as:
+
+```bash
+$ pytest
+```

 ## unsorted todos

@@ -70,7 +74,7 @@ Which gives the same results. I'd love to find some time to create actual tests,
 - todo support inferencing beyond max_seq_len steps, have to think through the kv cache
 - why is MFU so low (~20%) on my A100 40GB for training?
 - weird errors with torch.compile and wandb when using DDP
-- make tests to decrease yolo
+- make more better tests to decrease yolo

 ## License
 MIT
@@ -288,19 +288,19 @@ class Transformer(nn.Module):
            idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
-            # pluck the logits at the final step and scale by desired temperature
-            logits = logits[:, -1, :] / temperature
-            # optionally crop the logits to only the top k options
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float('Inf')
-            # apply softmax to convert logits to (normalized) probabilities
-            probs = F.softmax(logits, dim=-1)
+            logits = logits[:, -1, :] # crop to just the final time step
            if temperature == 0.0:
-                # sample the most likely index
-                _, idx_next = torch.topk(probs, k=1, dim=-1)
+                # "sample" the single most likely index
+                _, idx_next = torch.topk(logits, k=1, dim=-1)
            else:
-                # sample from the distribution
+                # scale the final-step logits by the desired temperature
+                logits = logits / temperature
+                # optionally crop the logits to only the top k options
+                if top_k is not None:
+                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = -float('Inf')
+                # apply softmax to convert logits to (normalized) probabilities
+                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
@@ -148,36 +148,6 @@ void free_weights(TransformerWeights* w, Config* p) {
 // ----------------------------------------------------------------------------
 // initialization: random init, or read from checkpoint

-// initializes weights to random numbers from -.5 to .5
-void init_rand(float* w, int size) {
-    for (int i = 0; i < size; i++) {
-        w[i] = ((float)rand()/(float)(RAND_MAX)) - 0.5f;
-    }
-}
-
-// constant init
-void init_const(float* w, int size, float val) {
-    for (int i = 0; i < size; i++) {
-        w[i] = val;
-    }
-}
-
-void random_init_weights(TransformerWeights* w, Config* p) {
-    init_rand(w->token_embedding_table, p->vocab_size * p->dim);
-    init_const(w->rms_att_weight, p->n_layers * p->dim, 1.0f);
-    init_const(w->rms_ffn_weight, p->n_layers * p->dim, 1.0f);
-    init_rand(w->wq, p->n_layers * p->dim * p->dim);
-    init_rand(w->wk, p->n_layers * p->dim * p->dim);
-    init_rand(w->wv, p->n_layers * p->dim * p->dim);
-    init_rand(w->wo, p->n_layers * p->dim * p->dim);
-    init_rand(w->w1, p->n_layers * p->dim * p->hidden_dim);
-    init_rand(w->w2, p->n_layers * p->hidden_dim * p->dim);
-    init_rand(w->w3, p->n_layers * p->dim * p->hidden_dim);
-    init_const(w->rms_final_weight, p->dim, 1.0f);
-    init_rand(w->freq_cis_real, p->seq_len * p->dim / 2);
-    init_rand(w->freq_cis_imag, p->seq_len * p->dim / 2);
-}
-
 void checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
    fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f);
    fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f);
@@ -9,7 +9,7 @@ import subprocess
 import time

 # specify your command
-command = ["./run", "model.bin", "0.0"]
+command = ["./run", "model.bin"]

 # Start the process
 proc = subprocess.Popen(command, stdout=subprocess.PIPE)
@@ -44,8 +44,6 @@ for k,v in list(state_dict.items()):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
 model.load_state_dict(state_dict, strict=False)

-model.export() # model.bin
-
 model.eval()
 model.to(device)
 if compile:
@@ -0,0 +1,53 @@
+"""
+Run simply with
+$ pytest
+"""
+import os
+import pytest # pip install pytest
+import subprocess
+
+import torch
+from model import ModelArgs, Transformer
+
+def test_argmax_inference():
+    """
+    Only the simplest test for now: run inference with temperature 0 
+    (for determinism) in both C and PyTorch, and see that the sampled tokens 
+    are the same.
+    """
+    test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?
+
+    # run C version
+    model_path = os.path.join(test_ckpt_dir, "model.bin")
+    command = ["./run", model_path, "0.0"]
+    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
+    c_tokens = []
+    for line in proc.stdout:
+        token = int(line.decode('utf-8').strip())
+        c_tokens.append(token)
+    proc.wait()
+    #print(c_tokens)
+
+    # run PyTorch version
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    gptconf = ModelArgs(**checkpoint['model_args'])
+    model = Transformer(gptconf)
+    state_dict = checkpoint['model']
+    unwanted_prefix = '_orig_mod.'
+    for k,v in list(state_dict.items()):
+        if k.startswith(unwanted_prefix):
+            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict, strict=False)
+    model.eval()
+    model.to(device)
+    x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
+    with torch.inference_mode():
+        y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
+    pt_tokens = y[0].tolist()
+    pt_tokens = pt_tokens[1:] # remove BOS
+    #print(pt_tokens)
+
+    # compare
+    assert c_tokens == pt_tokens
@@ -55,7 +55,7 @@ dropout = 0.0
 # adamw optimizer
 gradient_accumulation_steps = 4  # used to simulate larger batch sizes
 learning_rate = 5e-4  # max learning rate
-max_iters = 300000  # total number of training iterations
+max_iters = 100000  # total number of training iterations
 weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95