add back careful processing of the byte tokens

2023-08-15 01:42:33 +00:00
parent 4bf36ecc17
commit d459fd4243
1 changed files with 15 additions and 1 deletions
@@ -10,6 +10,7 @@ $ ./run

 #include <stdio.h>
 #include <stdlib.h>
+#include <ctype.h>
 #include <time.h>
 #include <math.h>
 #include <string.h>
@@ -688,7 +689,20 @@ int main(int argc, char *argv[]) {

        // following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89)
        char *token_str = (token == 1 && vocab[next][0] == ' ') ? vocab[next]+1 : vocab[next];
-        printf("%s", token_str);
+        // careful, some tokens designate raw bytes, and look like e.g. '<0x01>'
+        unsigned char byte_val;
+        if (sscanf(token_str, "<0x%02hhX>", &byte_val) == 1) {
+            // ok this token is a raw byte token, carefuly to only print printable chars or whitespace
+            // some of the other bytes can be various control codes, backspace, etc. => skip
+            if (isprint(byte_val) || isspace(byte_val)) {
+                char byte_piece[2];
+                byte_piece[0] = byte_val;
+                byte_piece[1] = '\0';
+                printf("%s", byte_piece);
+            }
+        } else {
+            printf("%s", token_str);
+        }
        fflush(stdout);
        token = next;