delete the run_wrap file! yay. ty @python273 and @ggerganov for code snippets
This commit is contained in:
@@ -9,6 +9,7 @@ from typing import List
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
|
||||
TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C
|
||||
|
||||
class Tokenizer:
|
||||
def __init__(self):
|
||||
@@ -36,3 +37,29 @@ class Tokenizer:
|
||||
|
||||
def decode(self, t: List[int]) -> str:
    """Convert a list of token ids back into a string.

    The work is delegated entirely to ``self.sp_model.decode``.
    """
    text = self.sp_model.decode(t)
    return text
|
||||
|
||||
def export(self):
    """Write the vocabulary to TOKENIZER_BIN as length-prefixed byte strings.

    Each token id in ``range(self.n_words)`` is converted to the bytes that
    represent it, and the records are written in id order. A record is a
    4-byte little-endian length followed by that many payload bytes.

    Post-processing applied to each sentencepiece piece:
      * the BOS / EOS ids become ``<s>`` / ``</s>`` wrapped in newlines;
      * byte-fallback pieces of the form ``<0xNN>`` become the single raw
        byte ``NN``;
      * the sentencepiece whitespace marker '▁' is replaced by a space.
    """
    encoded_tokens = []
    for i in range(self.n_words):
        piece = self.sp_model.id_to_piece(i)
        if i == self.bos_id:
            data = b'\n<s>\n'
        elif i == self.eos_id:
            data = b'\n</s>\n'
        elif len(piece) == 6 and piece.startswith('<0x') and piece.endswith('>'):
            # e.g. make '<0x01>' into b'\x01'. Build the byte directly:
            # chr(value).encode('utf-8') would expand 0x80..0xFF into a
            # two-byte sequence and corrupt multi-byte characters that are
            # assembled from consecutive byte tokens.
            data = bytes([int(piece[3:5], 16)])
        else:
            # sentencepiece uses '▁' as the whitespace marker
            data = piece.replace('▁', ' ').encode('utf-8')
        encoded_tokens.append(data)

    # All tokens are converted before the file is opened, so a failure above
    # does not leave a truncated output file behind.
    with open(TOKENIZER_BIN, 'wb') as f:
        for data in encoded_tokens:
            f.write(len(data).to_bytes(4, 'little'))  # write length of bytes
            f.write(data)  # write token bytes
|
||||
|
||||
if __name__ == "__main__":
    # Running this file as a script builds a Tokenizer and writes the
    # binary vocabulary file via export().
    tokenizer = Tokenizer()
    tokenizer.export()
|
||||
|
||||
Reference in New Issue
Block a user