delete the run_wrap file! yay. ty @python273 and @ggerganov for code snippets

2023-07-24 04:02:57 +00:00
parent 44ecc784da
commit 3bfa5665d1
5 changed files with 60 additions and 52 deletions
@@ -9,6 +9,7 @@ from typing import List
 from sentencepiece import SentencePieceProcessor

 TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
+TOKENIZER_BIN = "tokenizer.bin" # binary version of the tokenizer for inference in C

 class Tokenizer:
    def __init__(self):
@@ -36,3 +37,29 @@ class Tokenizer:

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)
+
+    def export(self):
+        """Write the vocabulary to TOKENIZER_BIN for inference in C.
+
+        Tokens are written in id order (0 .. n_words - 1). Each record is a
+        4-byte little-endian length followed by that many bytes of token data.
+        """
+        tokens = []
+        for i in range(self.n_words):
+
+            # decode the token and light postprocessing
+            t = self.sp_model.id_to_piece(i)
+            if i == self.bos_id:
+                data = b'\n<s>\n'
+            elif i == self.eos_id:
+                data = b'\n</s>\n'
+            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
+                # e.g. make '<0x01>' into the single raw byte b'\x01'.
+                # Going through chr() and UTF-8 encoding would expand the
+                # values 0x80..0xFF into two bytes each, corrupting them.
+                data = bytes([int(t[3:5], 16)])
+            else:
+                # sentencepiece uses '▁' as the whitespace marker
+                data = t.replace('▁', ' ').encode('utf-8')
+
+            tokens.append(data)
+
+        with open(TOKENIZER_BIN, 'wb') as f:
+            for data in tokens:
+                f.write(len(data).to_bytes(4, 'little'))  # write length of bytes
+                f.write(data)  # write token bytes
+
+if __name__ == "__main__":
+    # Script entry point: construct the tokenizer and write its binary export.
+    Tokenizer().export()