From befe4867b34723d0ba95d30784a42e4f522a4057 Mon Sep 17 00:00:00 2001
From: rdentato <rdentato@gmail.com>
Date: Wed, 16 Aug 2023 07:42:53 +0000
Subject: [PATCH 1/2] minimal protection against invalid UTF8 encoding.

---
 run.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/run.c b/run.c
index 43af271..70951c0 100644
--- a/run.c
+++ b/run.c
@@ -396,7 +396,8 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
         str_buffer[str_len] = '\0';
 
         // while the next character is a continuation byte, continue appending
-        if ((*(c+1) & 0xC0) == 0x80) {
+        // but if there are too many of them, just stop to avoid overrunning str_buffer.
+        if ((*(c+1) & 0xC0) == 0x80 && str_len < 4) {
             continue;
         }
 
@@ -414,6 +415,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
                 tokens[(*n_tokens)++] = (unsigned char)str_buffer[i] + 3;
             }
         }
+        str_len = 0; // protect against a sequence of stray UTF8 continuation bytes
     }
 
     // merge the best consecutive pair each iteration, according the scores in vocab_scores

From 55e60740f5c94ec37f66212864242bb6ee910065 Mon Sep 17 00:00:00 2001
From: rdentato <rdentato@gmail.com>
Date: Wed, 16 Aug 2023 07:58:07 +0000
Subject: [PATCH 2/2] Added space to str_buffer in case max_token_length is 1.

---
 run.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.c b/run.c
index 70951c0..513eda9 100644
--- a/run.c
+++ b/run.c
@@ -362,7 +362,7 @@ void bpe_encode(char *text, char **vocab, float *vocab_scores, int vocab_size, u
     qsort(sorted_vocab, vocab_size, sizeof(TokenIndex), compare_tokens);
 
     // create a temporary buffer that will store merge candidates of always two consecutive tokens
-    char* str_buffer = malloc((max_token_length*2+1) * sizeof(char)); // *2 for concat, +1 for null terminator
+    char* str_buffer = malloc((max_token_length*2 +1 +2) * sizeof(char)); // *2 for concat, +1 for null terminator, +2 for UTF8 (in case max_token_length is 1)
     size_t str_len = 0;
 
     // add_dummy_prefix is true by default