prompt tokenizer improvements: utf8 support, add_dummy_prefix and byte_fallback options to match sentencepiece

This commit is contained in:
atamyrat
2023-08-04 04:18:20 +03:00
parent 3c3b19b14c
commit c02865df30
3 changed files with 38 additions and 10 deletions
-2
View File
@@ -52,8 +52,6 @@ class Tokenizer:
t = '\n<s>\n'
elif i == self.eos_id:
t = '\n</s>\n'
elif len(t) == 6 and t.startswith('<0x') and t.endswith('>'):
t = chr(int(t[3:5], 16)) # e.g. make '<0x01>' into '\x01'
t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
b = t.encode('utf-8') # bytes of this token, utf-8 encoded