small stylistic fixes and adjustments, fix bug in Makefile, and change the timing code to skip the first (slow) iteration
This commit is contained in:
@@ -42,7 +42,7 @@ rungnu:
|
|||||||
$(CC) -Ofast -std=gnu11 -o run run.c -lm
|
$(CC) -Ofast -std=gnu11 -o run run.c -lm
|
||||||
|
|
||||||
.PHONY: runompgnu
|
.PHONY: runompgnu
|
||||||
rungnu:
|
runompgnu:
|
||||||
$(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run
|
$(CC) -Ofast -fopenmp -std=gnu11 run.c -lm -o run
|
||||||
|
|
||||||
.PHONY: clean
|
.PHONY: clean
|
||||||
|
|||||||
@@ -193,6 +193,7 @@ void softmax(float* x, int size) {
|
|||||||
|
|
||||||
void matmul(float* xout, float* x, float* w, int n, int d) {
|
void matmul(float* xout, float* x, float* w, int n, int d) {
|
||||||
// W (d,n) @ x (n,) -> xout (d,)
|
// W (d,n) @ x (n,) -> xout (d,)
|
||||||
|
// by far the most time is spent inside this little function
|
||||||
int i;
|
int i;
|
||||||
#pragma omp parallel for private(i)
|
#pragma omp parallel for private(i)
|
||||||
for (i = 0; i < d; i++) {
|
for (i = 0; i < d; i++) {
|
||||||
@@ -398,15 +399,12 @@ int main(int argc, char *argv[]) {
|
|||||||
// read in the model.bin file
|
// read in the model.bin file
|
||||||
Config config;
|
Config config;
|
||||||
TransformerWeights weights;
|
TransformerWeights weights;
|
||||||
int fd = 0;
|
int fd = 0; // file descriptor for memory mapping
|
||||||
float* data = NULL;
|
float* data = NULL; // memory mapped data pointer
|
||||||
long file_size;
|
long file_size; // size of the checkpoint file in bytes
|
||||||
{
|
{
|
||||||
FILE *file = fopen(checkpoint, "rb");
|
FILE *file = fopen(checkpoint, "rb");
|
||||||
if (!file) {
|
if (!file) { printf("Couldn't open file %s\n", checkpoint); return 1; }
|
||||||
printf("Unable to open the checkpoint file %s!\n", checkpoint);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
// read in the config header
|
// read in the config header
|
||||||
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
|
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
|
||||||
// negative vocab size is hacky way of signaling unshared weights. bit yikes.
|
// negative vocab size is hacky way of signaling unshared weights. bit yikes.
|
||||||
@@ -431,11 +429,7 @@ int main(int argc, char *argv[]) {
|
|||||||
char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
|
char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
|
||||||
{
|
{
|
||||||
FILE *file = fopen("tokenizer.bin", "rb");
|
FILE *file = fopen("tokenizer.bin", "rb");
|
||||||
if (!file) {
|
if (!file) { printf("Couldn't load tokenizer.bin\n"); return 1; }
|
||||||
printf("Unable to open the tokenizer file tokenizer.bin! Run "
|
|
||||||
"python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
int len;
|
int len;
|
||||||
for (int i = 0; i < config.vocab_size; i++) {
|
for (int i = 0; i < config.vocab_size; i++) {
|
||||||
if(fread(&len, sizeof(int), 1, file) != 1) { return 1; }
|
if(fread(&len, sizeof(int), 1, file) != 1) { return 1; }
|
||||||
@@ -451,7 +445,7 @@ int main(int argc, char *argv[]) {
|
|||||||
malloc_run_state(&state, &config);
|
malloc_run_state(&state, &config);
|
||||||
|
|
||||||
// the current position we are in
|
// the current position we are in
|
||||||
long start = time_in_ms();
|
long start = 0; // used to time our code, only initialized after first iteration
|
||||||
int next;
|
int next;
|
||||||
int token = 1; // 1 = BOS token in Llama-2 sentencepiece
|
int token = 1; // 1 = BOS token in Llama-2 sentencepiece
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
@@ -479,11 +473,13 @@ int main(int argc, char *argv[]) {
|
|||||||
// advance forward
|
// advance forward
|
||||||
token = next;
|
token = next;
|
||||||
pos++;
|
pos++;
|
||||||
|
// init our timer here because the first iteration is slow due to memmap
|
||||||
|
if (start == 0) { start = time_in_ms(); }
|
||||||
}
|
}
|
||||||
|
|
||||||
// report achieved tok/s
|
// report achieved tok/s
|
||||||
long end = time_in_ms();
|
long end = time_in_ms();
|
||||||
printf("\nachieved tok/s: %f\n", steps / (double)(end-start)*1000);
|
printf("\nachieved tok/s: %f\n", (steps-1) / (double)(end-start)*1000);
|
||||||
|
|
||||||
// memory and file handles cleanup
|
// memory and file handles cleanup
|
||||||
free_run_state(&state);
|
free_run_state(&state);
|
||||||
|
|||||||
Reference in New Issue
Block a user