turned on trimTrailingWhitespace in my vscode sorry about that
This commit is contained in:
@@ -61,7 +61,7 @@ def apply_rotary_emb(
|
|||||||
# reshape xq and xk to match the complex representation
|
# reshape xq and xk to match the complex representation
|
||||||
xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)
|
xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)
|
||||||
xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)
|
xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)
|
||||||
|
|
||||||
# reshape freqs_cos and freqs_sin for broadcasting
|
# reshape freqs_cos and freqs_sin for broadcasting
|
||||||
freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
|
freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
|
||||||
freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
|
freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
|
||||||
@@ -154,7 +154,7 @@ class Attention(nn.Module):
|
|||||||
|
|
||||||
# restore time as batch dimension and concat heads
|
# restore time as batch dimension and concat heads
|
||||||
output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
|
output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
|
||||||
|
|
||||||
# final projection into the residual stream
|
# final projection into the residual stream
|
||||||
output = self.wo(output)
|
output = self.wo(output)
|
||||||
output = self.resid_dropout(output)
|
output = self.resid_dropout(output)
|
||||||
@@ -170,7 +170,7 @@ class FeedForward(nn.Module):
|
|||||||
self.w2 = nn.Linear(hidden_dim, dim, bias=False)
|
self.w2 = nn.Linear(hidden_dim, dim, bias=False)
|
||||||
self.w3 = nn.Linear(dim, hidden_dim, bias=False)
|
self.w3 = nn.Linear(dim, hidden_dim, bias=False)
|
||||||
self.dropout = nn.Dropout(dropout)
|
self.dropout = nn.Dropout(dropout)
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
|
return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
|
||||||
|
|
||||||
@@ -222,7 +222,7 @@ class Transformer(nn.Module):
|
|||||||
freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
|
freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
|
||||||
self.register_buffer("freqs_cos", freqs_cos, persistent=False)
|
self.register_buffer("freqs_cos", freqs_cos, persistent=False)
|
||||||
self.register_buffer("freqs_sin", freqs_sin, persistent=False)
|
self.register_buffer("freqs_sin", freqs_sin, persistent=False)
|
||||||
|
|
||||||
# init all weights
|
# init all weights
|
||||||
self.apply(self._init_weights)
|
self.apply(self._init_weights)
|
||||||
# apply special scaled init to the residual projections, per GPT-2 paper
|
# apply special scaled init to the residual projections, per GPT-2 paper
|
||||||
@@ -304,7 +304,7 @@ class Transformer(nn.Module):
|
|||||||
flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
|
flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
|
||||||
mfu = flops_achieved / flops_promised
|
mfu = flops_achieved / flops_promised
|
||||||
return mfu
|
return mfu
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
|
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
|
||||||
"""
|
"""
|
||||||
@@ -334,7 +334,7 @@ class Transformer(nn.Module):
|
|||||||
idx_next = torch.multinomial(probs, num_samples=1)
|
idx_next = torch.multinomial(probs, num_samples=1)
|
||||||
# append sampled index to the running sequence and continue
|
# append sampled index to the running sequence and continue
|
||||||
idx = torch.cat((idx, idx_next), dim=1)
|
idx = torch.cat((idx, idx_next), dim=1)
|
||||||
|
|
||||||
return idx
|
return idx
|
||||||
|
|
||||||
def export(self, filepath='model.bin'):
|
def export(self, filepath='model.bin'):
|
||||||
@@ -350,13 +350,13 @@ class Transformer(nn.Module):
|
|||||||
hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
|
hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
|
||||||
p = self.params
|
p = self.params
|
||||||
n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
|
n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
|
||||||
header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
|
header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
|
||||||
n_kv_heads, p.vocab_size, p.max_seq_len)
|
n_kv_heads, p.vocab_size, p.max_seq_len)
|
||||||
f.write(header)
|
f.write(header)
|
||||||
|
|
||||||
# next write out the embedding weights
|
# next write out the embedding weights
|
||||||
serialize(self.tok_embeddings.weight)
|
serialize(self.tok_embeddings.weight)
|
||||||
|
|
||||||
# now all the layers
|
# now all the layers
|
||||||
# attention weights
|
# attention weights
|
||||||
for layer in self.layers:
|
for layer in self.layers:
|
||||||
|
|||||||
@@ -89,8 +89,8 @@ void malloc_run_state(RunState* s, Config* p) {
|
|||||||
s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
|
s->key_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
|
||||||
s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
|
s->value_cache = calloc(p->n_layers * p->seq_len * p->dim, sizeof(float));
|
||||||
// ensure all mallocs went fine
|
// ensure all mallocs went fine
|
||||||
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
|
if (!s->x || !s->xb || !s->xb2 || !s->hb || !s->hb2 || !s->q
|
||||||
|| !s->k || !s->v || !s->att || !s->logits || !s->key_cache
|
|| !s->k || !s->v || !s->att || !s->logits || !s->key_cache
|
||||||
|| !s->value_cache) {
|
|| !s->value_cache) {
|
||||||
printf("malloc failed!\n");
|
printf("malloc failed!\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
@@ -252,7 +252,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
|
|||||||
float* value_cache_row = s->value_cache + loff + pos * dim;
|
float* value_cache_row = s->value_cache + loff + pos * dim;
|
||||||
memcpy(key_cache_row, s->k, dim*sizeof(*key_cache_row));
|
memcpy(key_cache_row, s->k, dim*sizeof(*key_cache_row));
|
||||||
memcpy(value_cache_row, s->v, dim*sizeof(*value_cache_row));
|
memcpy(value_cache_row, s->v, dim*sizeof(*value_cache_row));
|
||||||
|
|
||||||
// multihead attention. iterate over all heads
|
// multihead attention. iterate over all heads
|
||||||
int h;
|
int h;
|
||||||
#pragma omp parallel for private(h)
|
#pragma omp parallel for private(h)
|
||||||
@@ -306,7 +306,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
|
|||||||
// first calculate self.w1(x) and self.w3(x)
|
// first calculate self.w1(x) and self.w3(x)
|
||||||
matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
|
matmul(s->hb, s->xb, w->w1 + l*dim*hidden_dim, dim, hidden_dim);
|
||||||
matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
|
matmul(s->hb2, s->xb, w->w3 + l*dim*hidden_dim, dim, hidden_dim);
|
||||||
|
|
||||||
// F.silu; silu(x)=x*σ(x),where σ(x) is the logistic sigmoid
|
// F.silu; silu(x)=x*σ(x),where σ(x) is the logistic sigmoid
|
||||||
for (int i = 0; i < hidden_dim; i++) {
|
for (int i = 0; i < hidden_dim; i++) {
|
||||||
s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i])));
|
s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i])));
|
||||||
@@ -323,7 +323,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights*
|
|||||||
// residual connection
|
// residual connection
|
||||||
accum(x, s->xb, dim);
|
accum(x, s->xb, dim);
|
||||||
}
|
}
|
||||||
|
|
||||||
// final rmsnorm
|
// final rmsnorm
|
||||||
rmsnorm(x, x, w->rms_final_weight, dim);
|
rmsnorm(x, x, w->rms_final_weight, dim);
|
||||||
|
|
||||||
@@ -345,7 +345,7 @@ int str_lookup(char *str, char **vocab, int vocab_size) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, unsigned int max_token_length, int *tokens, int *n_tokens) {
|
void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, unsigned int max_token_length, int *tokens, int *n_tokens) {
|
||||||
|
|
||||||
// a temporary buffer to merge two consecutive tokens
|
// a temporary buffer to merge two consecutive tokens
|
||||||
char* str_buffer = malloc((max_token_length*2+1) * sizeof(char)); // *2 for concat, +1 for null terminator
|
char* str_buffer = malloc((max_token_length*2+1) * sizeof(char)); // *2 for concat, +1 for null terminator
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user