somewhere ~20 hours later

2023-07-23 05:23:45 +00:00
parent 731657856e
commit 5b161abb9a
11 changed files with 1616 additions and 0 deletions
@@ -0,0 +1,34 @@
+"""
+wrapper around run.c
+mostly deals with the sentencepiece encoding/decoding
+C code does all the transformer inference of the individual tokens
+"""
+
+from tokenizer import Tokenizer
+import subprocess
+import time
+
+# specify your command
+command = ["./run", "model.bin", "0.0"]
+
+# Start the process
+proc = subprocess.Popen(command, stdout=subprocess.PIPE)
+enc = Tokenizer()
+
+t0 = time.time()
+tokens = []
+for line in proc.stdout:
+    token = int(line.decode('utf-8').strip())
+    dec = enc.decode([token])
+    print(dec, end=" ", flush=True)
+    tokens.append(token)
+t1 = time.time()
+
+print('\n---\n')
+print("Sorry I'm not sure why sentencepiece can't stream tokens properly, I'll solve it later. Here is the whole thing:")
+print('\n---\n')
+print(enc.decode(tokens))
+
+print(f"achieved tok/s: {len(tokens) / (t1 - t0)}")
+# Wait for the process to finish
+proc.wait()