Big change: adding prompting. Many lines of code, but critical. Thank you @atamurad for the first draft; I ended up tuning it quite a bit.

2023-07-28 04:12:54 +00:00
parent 568a651c45
commit b4bb47bb7b
3 changed files with 130 additions and 30 deletions
@@ -3,6 +3,7 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

 import os
+import struct
 from logging import getLogger
 from typing import List

@@ -39,26 +40,35 @@ class Tokenizer:
        return self.sp_model.decode(t)

    def export(self):
-        tokens = []
+
+        # get all the tokens (postprocessed) and their scores as floats
+        tokens, scores = [], []
        for i in range(self.n_words):

            # decode the token and light postprocessing
            t = self.sp_model.id_to_piece(i)
+            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
                t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
-            t = t.replace('▁', ' ') # sentencepiece uses this as the whitespace
+            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
+            b = t.encode('utf-8') # bytes of this token, utf-8 encoded

-            tokens.append(t)
+            tokens.append(b)
+            scores.append(s)
        
+        # record the max token length
+        max_token_length = max(len(t) for t in tokens)
+
+        # write to a binary file
        with open(TOKENIZER_BIN, 'wb') as f:
-            for token in tokens:
-                bytes = token.encode('utf-8')
-                f.write((len(bytes)).to_bytes(4, 'little'))  # write length of bytes
-                f.write(bytes)  # write token bytes
+            f.write(struct.pack("I", max_token_length))
+            for bytes, score in zip(tokens, scores):
+                f.write(struct.pack("fI", score, len(bytes)))
+                f.write(bytes)

 if __name__ == "__main__":
    t = Tokenizer()