delete the run_wrap file! yay. ty @python273 and @ggerganov for code snippets

2023-07-24 04:02:57 +00:00
parent 44ecc784da
commit 3bfa5665d1
5 changed files with 60 additions and 52 deletions
@@ -26,14 +26,7 @@ gcc -O3 -o run run.c -lm
 ./run out/model.bin
 ```

-You'll notice that this just streams the raw tokens. (See [performance](#performance) for compile flags that can significantly speed this up). Unless you can read those directly, you'll want to translate them into text. For now sadly we have to run this C code through a simple wrapper that does the translation (see the file, it's just 30 lines):
-
-```bash
-pip install sentencepiece
-python run_wrap.py
-```
-
-You'll see text stream. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. Sample output:
+You'll see a sample of text stream by. On my M1 MacBook Air this runs at ~100 tokens/s, not bad for super naive fp32 single-threaded C code. See [performance](#performance) for compile flags that can significantly speed this up. Sample output:

 *Once upon a time, there was a boy named Timmy. Timmy loved to play sports with his friends. He was very good at throwing and catching balls. One day, Timmy's mom gave him a new shirt to wear to a party. Timmy thought it was impressive and asked his mom to explain what a shirt could be for. "A shirt is like a special suit for a basketball game," his mom said. Timmy was happy to hear that and put on his new shirt. He felt like a soldier going to the army and shouting. From that day on, Timmy wore his new shirt every time he played sports with his friends at the party. Once upon a time, there was a little girl named Lily. She loved to play outside with her friends. One day, Lily and her friend Emma were playing with a ball. Emma threw the ball too hard and it hit Lily's face. Lily felt embarrassed and didn't want to play anymore.
 Emma asked Lily what was wrong, and Lily told her about her memory. Emma told Lily that she was embarrassed because she had thrown the ball too hard. Lily felt bad
@@ -74,12 +67,6 @@ You can now run it simply as
 ./run out/model.bin
 ```

-But note that this only emits the SentencePiece tokens. To decode the tokens into text too, run this script through a simple wrapper:
-
-```bash
-python run_wrap.py
-```
-
 Watch the tokens stream by, fun! We can also run the PyTorch inference script for comparison (to run, add [model.ckpt](https://drive.google.com/file/d/1SM0rMxzy7babB-v4MfTg1GFqOCgWar5w/view?usp=share_link) to /out if you haven't already):

 ```bash
@@ -124,8 +111,7 @@ Also, I saw someone report higher throughput replacing `gcc` with `clang`.

 ## unsorted todos

- why SentencePiece can't iteratively decode properly?
- would love to delete run_wrap.py and just directly use C code to string
+- why is there a leading space in the output of the C sampling code when we `./run`?
 - todo multiquery support? doesn't seem as useful for smaller models that run on CPU (?)
 - todo support inferencing beyond max_seq_len steps, have to think through the kv cache
 - why is MFU so low (~10%) on my A100 40GB for training?
@@ -378,7 +378,6 @@ int argmax(float* v, int n) {
 // ----------------------------------------------------------------------------

 int main(int argc, char *argv[]) {
-    setbuf(stdout, NULL); // disable stdout buffering

    // poor man's C argparse
    char *checkpoint = NULL;
@@ -412,6 +411,24 @@ int main(int argc, char *argv[]) {
    }
    fread(&config, sizeof(Config), 1, file);

+    // init the Tokenizer
+    char** vocab = (char**)malloc(config.vocab_size * sizeof(char*));
+    {
+        FILE *file = fopen("tokenizer.bin", "r");
+        if (!file) {
+            printf("Unable to open the tokenizer file tokenizer.bin! Run "
+            "python tokenizer.py to convert tokenizer.model -> tokenizer.bin\n");
+            return 1;
+        }
+        int len;
+        for (int i = 0; i < config.vocab_size; i++) {
+            fread(&len, sizeof(int), 1, file);
+            vocab[i] = (char *)malloc(len + 1);
+            fread(vocab[i], len, 1, file);
+            vocab[i][len] = '\0'; // add the string terminating token
+        }
+    }
+
    // create and init the Transformer
    TransformerWeights weights;
    malloc_weights(&weights, &config);
@@ -421,8 +438,9 @@ int main(int argc, char *argv[]) {
    // create and init the application RunState
    RunState state;
    malloc_run_state(&state, &config);
-
+    
    // the current position we are in
+    clock_t start = clock();
    int next;
    int token = 1; // 1 = BOS token in Llama-2 sentencepiece
    int pos = 0;
@@ -443,14 +461,24 @@ int main(int argc, char *argv[]) {
            // we now want to sample from this distribution to get the next token
            next = sample(state.logits, config.vocab_size);
        }
-        printf("%d\n", next);
+        printf("%s", vocab[next]);
+        fflush(stdout);

        // advance forward
        token = next;
        pos++;
    }
+    printf("\n");
+    
+    // report our achieved tok/s
+    clock_t end = clock();
+    double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
+    printf("achieved tok/s: %f\n", config.seq_len / elapsed);

+    // memory cleanup
    free_run_state(&state);
    free_weights(&weights);
+    for (int i = 0; i < config.vocab_size; i++) { free(vocab[i]); }
+    free(vocab);
    return 0;
 }
@@ -1,33 +0,0 @@
-"""
-wrapper around run.c
-mostly deals with the sentencepiece encoding/decoding
-C code does all the transformer inference of the individual tokens
-"""
-
-from tokenizer import Tokenizer
-import subprocess
-import time
-
-# specify your command
-command = ["./run", "out/model.bin"]
-
-# Start the process
-proc = subprocess.Popen(command, stdout=subprocess.PIPE)
-enc = Tokenizer()
-
-t0 = time.time()
-tokens = []
-last = ''
-for line in proc.stdout:
-    token = int(line.decode('utf-8').strip())
-    dec = enc.decode(tokens + [token])
-    chunk = dec[len(last):]
-    print(chunk, end='',flush=True)
-    tokens.append(token)
-    last = dec
-t1 = time.time()
-# seeking help: how can we do streaming inference in sentencepiece properly?
-# or even delete sentencepiece entirely?
-
-print(f"\nachieved tok/s: {len(tokens) / (t1 - t0)}")
-proc.wait()
@@ -9,6 +9,7 @@ from typing import List
 from sentencepiece import SentencePieceProcessor

 TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
+TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C

 class Tokenizer:
    def __init__(self):
@@ -36,3 +37,29 @@ class Tokenizer:

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)
+
+    def export(self):
+        tokens = []
+        for i in range(self.n_words):
+
+            # decode the token and light postprocessing
+            t = self.sp_model.id_to_piece(i)
+            if i == self.bos_id:
+                t = '\n<s>\n'
+            elif i == self.eos_id:
+                t = '\n</s>\n'
+            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
+                t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
+            t = t.replace('▁', ' ') # sentencepiece uses this as the whitespace
+
+            tokens.append(t)
+        
+        with open(TOKENIZER_BIN, 'wb') as f:
+            for token in tokens:
+                bytes = token.encode('utf-8')
+                f.write((len(bytes)).to_bytes(4, 'little'))  # write length of bytes
+                f.write(bytes)  # write token bytes
+
+if __name__ == "__main__":
+    t = Tokenizer()
+    t.export()