diff --git a/Makefile b/Makefile index 41f6ef9..360cd2f 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ rungnu: $(CC) -Ofast -std=gnu11 -o run run.c -lm .PHONY: runompgnu -rungnu: +runompgnu: $(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run .PHONY: clean diff --git a/run.c b/run.c index eb344e2..68fabcd 100644 --- a/run.c +++ b/run.c @@ -193,6 +193,7 @@ void softmax(float* x, int size) { void matmul(float* xout, float* x, float* w, int n, int d) { // W (d,n) @ x (n,) -> xout (d,) + // by far the most amount of time is spent inside this little function int i; #pragma omp parallel for private(i) for (i = 0; i < d; i++) { @@ -205,7 +206,7 @@ void matmul(float* xout, float* x, float* w, int n, int d) { } void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* w) { - + // a few convenience variables float *x = s->x; int dim = p->dim; @@ -222,7 +223,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* // forward all the layers for(int l = 0; l < p->n_layers; l++) { - + // attention rmsnorm rmsnorm(s->xb, x, w->rms_att_weight + l*dim, dim); @@ -316,7 +317,7 @@ void transformer(int token, int pos, Config* p, RunState* s, TransformerWeights* for (int i = 0; i < hidden_dim; i++) { s->hb[i] = s->hb[i] * (1.0f / (1.0f + expf(-s->hb[i]))); } - + // elementwise multiply with w3(x) for (int i = 0; i < hidden_dim; i++) { s->hb[i] = s->hb[i] * s->hb2[i]; @@ -398,15 +399,12 @@ int main(int argc, char *argv[]) { // read in the model.bin file Config config; TransformerWeights weights; - int fd = 0; - float* data = NULL; - long file_size; + int fd = 0; // file descriptor for memory mapping + float* data = NULL; // memory mapped data pointer + long file_size; // size of the checkpoint file in bytes { FILE *file = fopen(checkpoint, "rb"); - if (!file) { - printf("Unable to open the checkpoint file %s!\n", checkpoint); - return 1; - } + if (!file) { printf("Couldn't open file %s\n", checkpoint); return 1; } // read in the config header if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } // negative vocab size is hacky way of signaling unshared weights. bit yikes. @@ -431,11 +429,7 @@ int main(int argc, char *argv[]) { char** vocab = (char**)malloc(config.vocab_size * sizeof(char*)); { FILE *file = fopen("tokenizer.bin", "rb"); - if (!file) { - printf("Unable to open the tokenizer file tokenizer.bin! Run " - "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n"); - return 1; - } + if (!file) { printf("Couldn't load tokenizer.bin\n"); return 1; } int len; for (int i = 0; i < config.vocab_size; i++) { if(fread(&len, sizeof(int), 1, file) != 1) { return 1; } @@ -451,7 +445,7 @@ int main(int argc, char *argv[]) { malloc_run_state(&state, &config); // the current position we are in - long start = time_in_ms(); + long start = 0; // used to time our code, only initialized after first iteration int next; int token = 1; // 1 = BOS token in Llama-2 sentencepiece int pos = 0; @@ -479,11 +473,13 @@ int main(int argc, char *argv[]) { // advance forward token = next; pos++; + // init our timer here because the first iteration is slow due to memmap + if (start == 0) { start = time_in_ms(); } } // report achieved tok/s long end = time_in_ms(); - printf("\nachieved tok/s: %f\n", steps / (double)(end-start)*1000); + printf("\nachieved tok/s: %f\n", (steps-1) / (double)(end-start)*1000); // memory and file handles cleanup free_run_state(&state);