# llama2.c/run_wrap.py
# 2023-07-23 05:23:45 +00:00 | 35 lines | 863 B | Python

"""
wrapper around run.c
mostly deals with the sentencepiece encoding/decoding
C code does all the transformer inference of the individual tokens
"""
from tokenizer import Tokenizer
import subprocess
import time
# Command for the C inference binary: executable, model file, and a third
# argument (presumably the sampling temperature -- confirm against run.c).
command = ["./run", "model.bin", "0.0"]

tokens = []

# Start the process. The context manager closes the stdout pipe and waits for
# the child on exit -- including when parsing or decoding raises -- so no open
# pipe or un-reaped process is left behind. It replaces the explicit
# proc.wait() that used to sit at the end of the script.
with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
    # Built after the child is launched, matching the original ordering.
    enc = Tokenizer()
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    # The child is expected to write one integer token id per line.
    for line in proc.stdout:
        text = line.decode('utf-8').strip()
        if not text:
            # A blank line carries no token id; int('') would raise ValueError.
            continue
        token = int(text)
        # Decode each token on its own for live feedback; the full sequence
        # is decoded in one go further down.
        dec = enc.decode([token])
        print(dec, end=" ", flush=True)
        tokens.append(token)
    t1 = time.perf_counter()

print('\n---\n')
print("Sorry I'm not sure why sentencepiece can't stream tokens properly, I'll solve it later. Here is the whole thing:")
print('\n---\n')
print(enc.decode(tokens))

# Guard against a zero-length interval so the throughput line cannot raise
# ZeroDivisionError (e.g. the child produced no output at all).
elapsed = t1 - t0
tok_per_s = len(tokens) / elapsed if elapsed > 0 else 0.0
print(f"achieved tok/s: {tok_per_s}")