delete the run_wrap file! yay. ty @python273 and @ggerganov for code snippets

This commit is contained in:
Andrej Karpathy
2023-07-24 04:02:57 +00:00
parent 44ecc784da
commit 3bfa5665d1
5 changed files with 60 additions and 52 deletions
+2 -16
View File
@@ -26,14 +26,7 @@ gcc -O3 -o run run.c -lm
./run out/model.bin
```
You'll notice that this just streams the raw tokens. (See [performance](#performance) for compile flags that can significantly speed this up). Unless you can read those directly, you'll want to translate them into text. For now sadly we have to run this C code through a simple wrapper that does the translation (see the file, it's just 30 lines):
```bash
pip install sentencepiece
python run_wrap.py
```
You'll see text stream. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. Sample output:
You'll see a sample of text stream by. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. See [performance](#performance) for compile flags that can significantly speed this up. Sample output:
*Once upon a time, there was a boy named Timmy. Timmy loved to play sports with his friends. He was very good at throwing and catching balls. One day, Timmy's mom gave him a new shirt to wear to a party. Timmy thought it was impressive and asked his mom to explain what a shirt could be for. "A shirt is like a special suit for a basketball game," his mom said. Timmy was happy to hear that and put on his new shirt. He felt like a soldier going to the army and shouting. From that day on, Timmy wore his new shirt every time he played sports with his friends at the party. Once upon a time, there was a little girl named Lily. She loved to play outside with her friends. One day, Lily and her friend Emma were playing with a ball. Emma threw the ball too hard and it hit Lily's face. Lily felt embarrassed and didn't want to play anymore.
Emma asked Lily what was wrong, and Lily told her about her memory. Emma told Lily that she was embarrassed because she had thrown the ball too hard. Lily felt bad
@@ -74,12 +67,6 @@ You can now run it simply as
./run out/model.bin
```
But note that this only emits the SentencePiece tokens. To decode the tokens into text too, run this script through a simple wrapper:
```bash
python run_wrap.py
```
Watch the tokens stream by, fun! We can also run the PyTorch inference script for comparison (to run, add [model.ckpt](https://drive.google.com/file/d/1SM0rMxzy7babB-v4MfTg1GFqOCgWar5w/view?usp=share_link) to /out if you haven't already):
```bash
@@ -124,8 +111,7 @@ Also, I saw someone report higher throughput replacing `gcc` with `clang`.
## unsorted todos
- why can't SentencePiece iteratively decode properly?
- would love to delete run_wrap.py and just directly use C code to string
- why is there a leading space in C sampling code when we `./run`?
- todo multiquery support? doesn't seem as useful for smaller models that run on CPU (?)
- todo support inferencing beyond max_seq_len steps, have to think through the kv cache
- why is MFU so low (~10%) on my A100 40GB for training?
+31 -3
View File
@@ -378,7 +378,6 @@ int argmax(float* v, int n) {
// ----------------------------------------------------------------------------
int main(int argc, char *argv[]) {
setbuf(stdout, NULL); // disable stdout buffering
// poor man's C argparse
char *checkpoint = NULL;
@@ -412,6 +411,24 @@ int main(int argc, char *argv[]) {
}
fread(&config, sizeof(Config), 1, file);
// init the Tokenizer
char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
{
FILE *file = fopen("tokenizer.bin", "r");
if (!file) {
printf("Unable to open the tokenizer file tokenizer.bin! Run "
"python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
return 1;
}
int len;
for (int i = 0; i < config.vocab_size; i++) {
fread(&len, sizeof(int), 1, file);
vocab[i] = (char *)malloc(len + 1);
fread(vocab[i], len, 1, file);
vocab[i][len] = '\0'; // add the string terminating token
}
}
// create and init the Transformer
TransformerWeights weights;
malloc_weights(&weights, &config);
@@ -421,8 +438,9 @@ int main(int argc, char *argv[]) {
// create and init the application RunState
RunState state;
malloc_run_state(&state, &config);
// the current position we are in
clock_t start = clock();
int next;
int token = 1; // 1 = BOS token in Llama-2 sentencepiece
int pos = 0;
@@ -443,14 +461,24 @@ int main(int argc, char *argv[]) {
// we now want to sample from this distribution to get the next token
next = sample(state.logits, config.vocab_size);
}
printf("%d\n", next);
printf("%s", vocab[next]);
fflush(stdout);
// advance forward
token = next;
pos++;
}
printf("\n");
// report our achieved tok/s
clock_t end = clock();
double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
printf("achieved tok/s: %f\n", config.seq_len / elapsed);
// memory cleanup
free_run_state(&state);
free_weights(&weights);
for (int i = 0; i < config.vocab_size; i++) { free(vocab[i]); }
free(vocab);
return 0;
}
-33
View File
@@ -1,33 +0,0 @@
"""
wrapper around run.c
mostly deals with the sentencepiece encoding/decoding
C code does all the transformer inference of the individual tokens
"""
from tokenizer import Tokenizer
import subprocess
import time
# command line that launches the C inference binary
command = ["./run", "out/model.bin"]

# launch it with stdout piped back so we can read the token ids it prints
proc = subprocess.Popen(command, stdout=subprocess.PIPE)
enc = Tokenizer()

t0 = time.time()
tokens = []
last = ''
for raw_line in proc.stdout:
    # each line of output carries one integer token id
    tokens.append(int(raw_line.decode('utf-8').strip()))
    # re-decode everything seen so far and print only the newly added suffix
    dec = enc.decode(tokens)
    print(dec[len(last):], end='', flush=True)
    last = dec
t1 = time.time()

# seeking help: how can we do streaming inference in sentencepiece properly?
# or even delete sentencepiece entirely?
elapsed = t1 - t0
print(f"\nachieved tok/s: {len(tokens) / elapsed}")
proc.wait()
BIN
View File
Binary file not shown.
+27
View File
@@ -9,6 +9,7 @@ from typing import List
from sentencepiece import SentencePieceProcessor
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
class Tokenizer:
def __init__(self):
@@ -36,3 +37,29 @@ class Tokenizer:
def decode(self, t: List[int]) -> str:
return self.sp_model.decode(t)
def export(self):
    """
    Write the vocabulary to TOKENIZER_BIN as a flat binary file.

    For every token id in range(self.n_words) the piece string is fetched
    from the sentencepiece model, lightly post-processed, and UTF-8 encoded.
    Each record in the output is a 4-byte little-endian length followed by
    that many bytes of token text.
    """
    tokens = []
    for i in range(self.n_words):
        # decode the token and light postprocessing
        t = self.sp_model.id_to_piece(i)
        if i == self.bos_id:
            t = '\n<s>\n'
        elif i == self.eos_id:
            t = '\n</s>\n'
        elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
            # e.g. make '<0x01>' into '\x01'
            # NOTE(review): for values >= 0x80 the utf-8 encoding below emits
            # two bytes rather than the single raw byte - confirm this is intended
            t = chr(int(t[3:5], 16))
        # sentencepiece uses U+2581 (lower one eighth block) as the whitespace
        # marker; written as an escape so the character cannot be lost in transit
        t = t.replace('\u2581', ' ')
        tokens.append(t)

    with open(TOKENIZER_BIN, 'wb') as f:
        for token in tokens:
            data = token.encode('utf-8')
            f.write(len(data).to_bytes(4, 'little'))  # write length of bytes
            f.write(data)  # write token bytes
if __name__ == "__main__":
t = Tokenizer()
t.export()