Delete the run_wrap file! Yay. Thanks to @python273 and @ggerganov for the code snippets.

This commit is contained in:
Andrej Karpathy
2023-07-24 04:02:57 +00:00
parent 44ecc784da
commit 3bfa5665d1
5 changed files with 60 additions and 52 deletions
+27
View File
@@ -9,6 +9,7 @@ from typing import List
from sentencepiece import SentencePieceProcessor
TOKENIZER_MODEL = "tokenizer.model" # file name of the Llama SentencePiece tokenizer model
TOKENIZER_BIN = "tokenizer.bin" # file name of the binary tokenizer written by Tokenizer.export(), for inference in C
class Tokenizer:
def __init__(self):
@@ -36,3 +37,29 @@ class Tokenizer:
def decode(self, t: List[int]) -> str:
    """Convert a list of token ids back to text using the SentencePiece model."""
    text = self.sp_model.decode(t)
    return text
def export(self):
    """Write the vocabulary to TOKENIZER_BIN as length-prefixed UTF-8 strings.

    Every token id in ``range(self.n_words)`` is converted to text and the
    results are written in id order. Each record is a 4-byte little-endian
    byte length followed by the UTF-8 bytes of the token.
    """
    tokens = []
    for i in range(self.n_words):
        # decode the token and light postprocessing
        t = self.sp_model.id_to_piece(i)
        if i == self.bos_id:
            t = '\n<s>\n'
        elif i == self.eos_id:
            t = '\n</s>\n'
        elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
            # e.g. make '<0x01>' into '\x01'
            # NOTE: code points >= 0x80 become two bytes once UTF-8 encoded
            # below, so those byte tokens are not written as one raw byte.
            t = chr(int(t[3:5], 16))
        # sentencepiece marks whitespace with U+2581 (LOWER ONE EIGHTH BLOCK).
        # It is spelled as an escape so the character cannot be silently lost;
        # replacing '' instead would insert a space between every character.
        t = t.replace('\u2581', ' ')
        tokens.append(t)

    with open(TOKENIZER_BIN, 'wb') as f:
        for token in tokens:
            token_bytes = token.encode('utf-8')
            f.write(len(token_bytes).to_bytes(4, 'little'))  # write length of bytes
            f.write(token_bytes)  # write token bytes
if __name__ == "__main__":
    # Run as a script: build a Tokenizer and write its binary export.
    Tokenizer().export()