From 570789aa04e2c487c18778d71f16c33f1bf45d04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mihai=20Nad=C4=83=C8=99?= <mihai@nadas.ro>
Date: Sun, 13 Aug 2023 17:49:10 +0300
Subject: [PATCH] Fixes https://github.com/karpathy/llama2.c/issues/280

There was a small bug in tinystories.py, described here: https://github.com/karpathy/llama2.c/issues/280

This commit simply passes vocab_size to get_tokenizer_model_path, avoiding a silent crash in process_shard when processing shards.
---
 tinystories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinystories.py b/tinystories.py
index 278c817..690cb02 100644
--- a/tinystories.py
+++ b/tinystories.py
@@ -120,7 +120,7 @@ def train_vocab(vocab_size):
 
 def process_shard(args, vocab_size):
     shard_id, shard = args
-    tokenizer_model = get_tokenizer_model_path()
+    tokenizer_model = get_tokenizer_model_path(vocab_size)
     enc = Tokenizer(tokenizer_model)
     with open(shard, "r") as f:
         data = json.load(f)