From befe4867b34723d0ba95d30784a42e4f522a4057 Mon Sep 17 00:00:00 2001 From: rdentato Date: Wed, 16 Aug 2023 07:42:53 +0000 Subject: [PATCH] minimal protection against invalid UTF8 encoding. --- run.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/run.c b/run.c index 43af271..70951c0 100644 --- a/run.c +++ b/run.c @@ -396,7 +396,8 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u str_buffer[str_len] = '\0'; // while the next character is a continuation byte, continue appending - if ((*(c+1) & 0xC0) == 0x80) { + // but if there are too many of them, just stop to avoid overrunning str_buffer size. + if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) { continue; } @@ -414,6 +415,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3; } } + str_len = 0; // protect against a sequence of stray UTF8 continuation bytes } // merge the best consecutive pair each iteration, according the scores in vocab_scores