tweaks and add a simple test

This commit is contained in:
Andrej Karpathy
2023-07-23 14:52:08 +00:00
parent f499d9d2b5
commit 9414e7a45e
7 changed files with 73 additions and 48 deletions
+7 -3
View File
@@ -54,13 +54,17 @@ python run_wrap.py
I hope to delete this script soon though. Anyway, watch the tokens stream by, fun!
To verify correctness, we can also run the PyTorch inference script:
We can also run the PyTorch inference script for comparison:
```bash
python sample.py
```
Which gives the same results. I'd love to find some time to create actual tests, one day maybe. For now I just manually inspected activations and verified that they match, and that the samples are identical at temperature 0. If someone wishes to help me with tests I welcome PRs.
This gives the same results. More detailed testing will be done in `test_all.py`, which is run as:
```bash
$ pytest
```
## unsorted todos
@@ -70,7 +74,7 @@ Which gives the same results. I'd love to find some time to create actual tests,
- todo support inferencing beyond max_seq_len steps, have to think through the kv cache
- why is MFU so low (~20%) on my A100 40GB for training?
- weird errors with torch.compile and wandb when using DDP
- make tests to decrease yolo
- make more and better tests to decrease yolo
## License
MIT
+11 -11
View File
@@ -288,19 +288,19 @@ class Transformer(nn.Module):
idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
# forward the model to get the logits for the index in the sequence
logits, _ = self(idx_cond)
# pluck the logits at the final step and scale by desired temperature
logits = logits[:, -1, :] / temperature
# optionally crop the logits to only the top k options
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')
# apply softmax to convert logits to (normalized) probabilities
probs = F.softmax(logits, dim=-1)
logits = logits[:, -1, :] # crop to just the final time step
if temperature == 0.0:
# sample the most likely index
_, idx_next = torch.topk(probs, k=1, dim=-1)
# "sample" the single most likely index
_, idx_next = torch.topk(logits, k=1, dim=-1)
else:
# sample from the distribution
# pluck the logits at the final step and scale by desired temperature
logits = logits / temperature
# optionally crop the logits to only the top k options
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')
# apply softmax to convert logits to (normalized) probabilities
probs = F.softmax(logits, dim=-1)
idx_next = torch.multinomial(probs, num_samples=1)
# append sampled index to the running sequence and continue
idx = torch.cat((idx, idx_next), dim=1)
-30
View File
@@ -148,36 +148,6 @@ void free_weights(TransformerWeights* w, Config* p) {
// ----------------------------------------------------------------------------
// initialization: random init, or read from checkpoint
// Fill w[0..size-1] with pseudo-random floats taken from rand(), scaled by
// RAND_MAX and shifted down by 0.5 so that each value lies in [-0.5, 0.5].
// Elements are written front to back, one rand() call per element.
void init_rand(float* w, int size) {
    int idx = 0;
    while (idx < size) {
        w[idx] = ((float)rand() / (float)(RAND_MAX)) - 0.5f;
        idx++;
    }
}
// Set every element of w[0..size-1] to the same value `val`.
void init_const(float* w, int size, float val) {
    int idx = 0;
    while (idx < size) {
        w[idx] = val;
        idx++;
    }
}
// Give every weight buffer in `w` a starting value, with element counts
// derived from the dimensions stored in `p`.
//   - rms_att_weight, rms_ffn_weight and rms_final_weight are set to 1.0f
//   - all other buffers are filled by init_rand
// The init_rand calls are kept in a fixed order so that a given rand() seed
// always yields the same contents for each buffer.
void random_init_weights(TransformerWeights* w, Config* p) {
    // element counts shared by several buffers
    int norm_size = p->n_layers * p->dim;
    int attn_size = p->n_layers * p->dim * p->dim;
    int ffn_size = p->n_layers * p->dim * p->hidden_dim;
    int freq_size = p->seq_len * p->dim / 2;

    init_rand(w->token_embedding_table, p->vocab_size * p->dim);

    init_const(w->rms_att_weight, norm_size, 1.0f);
    init_const(w->rms_ffn_weight, norm_size, 1.0f);

    init_rand(w->wq, attn_size);
    init_rand(w->wk, attn_size);
    init_rand(w->wv, attn_size);
    init_rand(w->wo, attn_size);

    init_rand(w->w1, ffn_size);
    init_rand(w->w2, ffn_size);
    init_rand(w->w3, ffn_size);

    init_const(w->rms_final_weight, p->dim, 1.0f);

    init_rand(w->freq_cis_real, freq_size);
    init_rand(w->freq_cis_imag, freq_size);
}
void checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f);
fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f);
+1 -1
View File
@@ -9,7 +9,7 @@ import subprocess
import time
# specify your command
command = ["./run", "model.bin", "0.0"]
command = ["./run", "model.bin"]
# Start the process
proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-2
View File
@@ -44,8 +44,6 @@ for k,v in list(state_dict.items()):
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict, strict=False)
model.export() # model.bin
model.eval()
model.to(device)
if compile:
+53
View File
@@ -0,0 +1,53 @@
"""
Run simply with
$ pytest
"""
import os
import pytest # pip install pytest
import subprocess
import torch
from model import ModelArgs, Transformer
def test_argmax_inference():
    """
    Only the simplest test for now: run inference with temperature 0
    (for determinism) in both C and PyTorch, and see that the sampled tokens
    are the same.

    Reads `model.bin` (for the `./run` binary) and `ckpt.pt` (for PyTorch)
    from the `out` directory, relative to the current working directory.
    """
    test_ckpt_dir = "out" # TODO create a dummy test checkpoint for this?

    # run C version; it is expected to print one integer token per line
    model_path = os.path.join(test_ckpt_dir, "model.bin")
    command = ["./run", model_path, "0.0"]
    c_tokens = []
    # the context manager closes the stdout pipe and waits for the process
    # to exit, even if parsing a line raises
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        for line in proc.stdout:
            token = int(line.decode('utf-8').strip())
            c_tokens.append(token)
    #print(c_tokens)

    # run PyTorch version
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = ModelArgs(**checkpoint['model_args'])
    model = Transformer(gptconf)
    state_dict = checkpoint['model']
    # strip the '_orig_mod.' prefix from checkpoint keys so they match the
    # parameter names of the freshly constructed model; iterate over a
    # snapshot of the keys because the dict is mutated inside the loop
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    model.to(device)
    x = torch.tensor([[1]], dtype=torch.long, device=device) # 1 is BOS
    with torch.inference_mode():
        y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
    pt_tokens = y[0].tolist()
    pt_tokens = pt_tokens[1:] # remove BOS
    #print(pt_tokens)

    # compare
    assert c_tokens == pt_tokens
+1 -1
View File
@@ -55,7 +55,7 @@ dropout = 0.0
# adamw optimizer
gradient_accumulation_steps = 4 # used to simulate larger batch sizes
learning_rate = 5e-4 # max learning rate
max_iters = 300000 # total number of training iterations
max_iters = 100000 # total number of training iterations
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95