tweaks and add a simple test
This commit is contained in:
@@ -54,13 +54,17 @@ python run_wrap.py
|
||||
|
||||
I hope to delete this script soon though. Anyway, watch the tokens stream by, fun!
|
||||
|
||||
To verify correctness, we can also run the PyTorch inference script:
|
||||
We can also run the PyTorch inference script for comparison:
|
||||
|
||||
```bash
|
||||
python sample.py
|
||||
```
|
||||
|
||||
Which gives the same results. I'd love to find some time to create actual tests, one day maybe. For now I just manually inspected activations and verified that they match, and that the samples are identical at temperature 0. If someone wishes to help me with tests I welcome PRs.
|
||||
Which gives the same results. More detailed testing will be done in `test_all.py`, run as:
|
||||
|
||||
```bash
|
||||
$ pytest
|
||||
```
|
||||
|
||||
## unsorted todos
|
||||
|
||||
@@ -70,7 +74,7 @@ Which gives the same results. I'd love to find some time to create actual tests,
|
||||
- todo: support inference beyond max_seq_len steps; have to think through the kv cache
|
||||
- why is MFU so low (~20%) on my A100 40GB for training?
|
||||
- weird errors with torch.compile and wandb when using DDP
|
||||
- make tests to decrease yolo
|
||||
- make more and better tests to decrease yolo
|
||||
|
||||
## License
|
||||
MIT
|
||||
|
||||
@@ -288,19 +288,19 @@ class Transformer(nn.Module):
|
||||
idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
|
||||
# forward the model to get the logits for the index in the sequence
|
||||
logits, _ = self(idx_cond)
|
||||
# pluck the logits at the final step and scale by desired temperature
|
||||
logits = logits[:, -1, :] / temperature
|
||||
# optionally crop the logits to only the top k options
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
||||
logits[logits < v[:, [-1]]] = -float('Inf')
|
||||
# apply softmax to convert logits to (normalized) probabilities
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
logits = logits[:, -1, :] # crop to just the final time step
|
||||
if temperature == 0.0:
|
||||
# sample the most likely index
|
||||
_, idx_next = torch.topk(probs, k=1, dim=-1)
|
||||
# "sample" the single most likely index
|
||||
_, idx_next = torch.topk(logits, k=1, dim=-1)
|
||||
else:
|
||||
# sample from the distribution
|
||||
# pluck the logits at the final step and scale by desired temperature
|
||||
logits = logits / temperature
|
||||
# optionally crop the logits to only the top k options
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
||||
logits[logits < v[:, [-1]]] = -float('Inf')
|
||||
# apply softmax to convert logits to (normalized) probabilities
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
idx_next = torch.multinomial(probs, num_samples=1)
|
||||
# append sampled index to the running sequence and continue
|
||||
idx = torch.cat((idx, idx_next), dim=1)
|
||||
|
||||
@@ -148,36 +148,6 @@ void free_weights(TransformerWeights* w, Config* p) {
|
||||
// ----------------------------------------------------------------------------
|
||||
// initialization: random init, or read from checkpoint
|
||||
|
||||
// Fills w[0..size-1] with pseudo-random floats in the range [-0.5, 0.5].
// Values come from rand(); this function does not seed the generator itself.
|
||||
void init_rand(float* w, int size) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
// rand()/RAND_MAX lies in [0, 1]; subtracting 0.5 centres the value on zero
w[i] = ((float)rand()/(float)(RAND_MAX)) - 0.5f;
|
||||
}
|
||||
}
|
||||
|
||||
// Constant init: sets each of the first `size` floats in w to `val`.
|
||||
void init_const(float* w, int size, float val) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
w[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
// Initializes every weight buffer in w without reading a checkpoint.
// Element counts are computed from the Config fields in p. The RMS-norm
// weights are set to 1.0f; all other buffers get random values via init_rand.
// NOTE(review): assumes each buffer in w is already allocated with at least
// the element count used below -- allocation is not visible here, confirm.
void random_init_weights(TransformerWeights* w, Config* p) {
|
||||
init_rand(w->token_embedding_table, p->vocab_size * p->dim);
|
||||
// RMS-norm weights are filled with the constant 1.0f rather than random values
init_const(w->rms_att_weight, p->n_layers * p->dim, 1.0f);
|
||||
init_const(w->rms_ffn_weight, p->n_layers * p->dim, 1.0f);
|
||||
// wq/wk/wv/wo: n_layers * dim * dim elements each
init_rand(w->wq, p->n_layers * p->dim * p->dim);
|
||||
init_rand(w->wk, p->n_layers * p->dim * p->dim);
|
||||
init_rand(w->wv, p->n_layers * p->dim * p->dim);
|
||||
init_rand(w->wo, p->n_layers * p->dim * p->dim);
|
||||
// w1/w2/w3: n_layers * dim * hidden_dim elements each
init_rand(w->w1, p->n_layers * p->dim * p->hidden_dim);
|
||||
init_rand(w->w2, p->n_layers * p->hidden_dim * p->dim);
|
||||
init_rand(w->w3, p->n_layers * p->dim * p->hidden_dim);
|
||||
init_const(w->rms_final_weight, p->dim, 1.0f);
|
||||
// freq_cis tables are also filled with random values: seq_len * dim / 2 elements each
init_rand(w->freq_cis_real, p->seq_len * p->dim / 2);
|
||||
init_rand(w->freq_cis_imag, p->seq_len * p->dim / 2);
|
||||
}
|
||||
|
||||
void checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
|
||||
fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f);
|
||||
fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f);
|
||||
|
||||
+1
-1
@@ -9,7 +9,7 @@ import subprocess
|
||||
import time
|
||||
|
||||
# specify your command
|
||||
command = ["./run", "model.bin", "0.0"]
|
||||
command = ["./run", "model.bin"]
|
||||
|
||||
# Start the process
|
||||
proc = subprocess.Popen(command, stdout=subprocess.PIPE)
|
||||
|
||||
@@ -44,8 +44,6 @@ for k,v in list(state_dict.items()):
|
||||
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
|
||||
model.load_state_dict(state_dict, strict=False)
|
||||
|
||||
model.export() # model.bin
|
||||
|
||||
model.eval()
|
||||
model.to(device)
|
||||
if compile:
|
||||
|
||||
+53
@@ -0,0 +1,53 @@
|
||||
"""
|
||||
Run simply with
|
||||
$ pytest
|
||||
"""
|
||||
import os
|
||||
import pytest # pip install pytest
|
||||
import subprocess
|
||||
|
||||
import torch
|
||||
from model import ModelArgs, Transformer
|
||||
|
||||
def test_argmax_inference():
    """
    Check that the C and PyTorch implementations sample identical tokens.

    Only the simplest test for now: run inference with temperature 0
    (for determinism) in both C and PyTorch, and see that the sampled tokens
    are the same.

    Expects a compiled ``./run`` binary in the working directory and both
    ``model.bin`` and ``ckpt.pt`` inside the ``out`` directory.

    Raises:
        subprocess.CalledProcessError: if ``./run`` exits with a non-zero status.
    """
    test_ckpt_dir = "out"  # TODO create a dummy test checkpoint for this?

    # run C version; check=True surfaces a crashed binary as an error instead
    # of silently comparing against a truncated token list
    model_path = os.path.join(test_ckpt_dir, "model.bin")
    command = ["./run", model_path, "0.0"]
    result = subprocess.run(
        command, stdout=subprocess.PIPE, encoding="utf-8", check=True
    )
    # the binary is expected to print one integer token id per line
    c_tokens = [int(line.strip()) for line in result.stdout.splitlines()]

    # run PyTorch version
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ckpt_path = os.path.join(test_ckpt_dir, "ckpt.pt")
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = ModelArgs(**checkpoint['model_args'])
    model = Transformer(gptconf)
    state_dict = checkpoint['model']
    # strip the '_orig_mod.' prefix from any checkpoint keys that carry it,
    # so they match the parameter names of the freshly built model;
    # iterate over a snapshot of the keys because the dict is mutated
    unwanted_prefix = '_orig_mod.'
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict, strict=False)
    model.eval()
    model.to(device)
    x = torch.tensor([[1]], dtype=torch.long, device=device)  # 1 is BOS
    with torch.inference_mode():
        y = model.generate(x, max_new_tokens=gptconf.max_seq_len, temperature=0.0)
    pt_tokens = y[0].tolist()
    pt_tokens = pt_tokens[1:]  # remove BOS

    # compare
    assert c_tokens == pt_tokens
|
||||
@@ -55,7 +55,7 @@ dropout = 0.0
|
||||
# adamw optimizer
|
||||
gradient_accumulation_steps = 4 # used to simulate larger batch sizes
|
||||
learning_rate = 5e-4 # max learning rate
|
||||
max_iters = 300000 # total number of training iterations
|
||||
max_iters = 100000 # total number of training iterations
|
||||
weight_decay = 1e-1
|
||||
beta1 = 0.9
|
||||
beta2 = 0.95
|
||||
|
||||
Reference in New Issue
Block a user