add back careful processing of the byte tokens
This commit is contained in:
@@ -10,6 +10,7 @@ $ ./run
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
@@ -688,7 +689,20 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
// following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89)
|
||||
char *token_str = (token == 1 && vocab[next][0] == ' ') ? vocab[next]+1 : vocab[next];
|
||||
printf("%s", token_str);
|
||||
// careful, some tokens designate raw bytes, and look like e.g. '<0x01>'
|
||||
unsigned char byte_val;
|
||||
if (sscanf(token_str, "<0x%02hhX>", &byte_val) == 1) {
|
||||
// ok, this token is a raw byte token; be careful to only print printable chars or whitespace
|
||||
// some of the other bytes can be various control codes, backspace, etc. => skip
|
||||
if (isprint(byte_val) || isspace(byte_val)) {
|
||||
char byte_piece[2];
|
||||
byte_piece[0] = byte_val;
|
||||
byte_piece[1] = '\0';
|
||||
printf("%s", byte_piece);
|
||||
}
|
||||
} else {
|
||||
printf("%s", token_str);
|
||||
}
|
||||
fflush(stdout);
|
||||
token = next;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user