Prompt tokenizer improvements: UTF-8 support, plus add_dummy_prefix and byte_fallback options to match SentencePiece

2023-08-04 04:18:20 +03:00
parent 3c3b19b14c
commit c02865df30
3 changed files with 38 additions and 10 deletions
@@ -52,8 +52,6 @@ class Tokenizer:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
-            elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
-                t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
            b = t.encode('utf-8') # bytes of this token, utf-8 encoded