Fixes https://github.com/karpathy/llama2.c/issues/280

There was a small bug in tinystories.py, described here: https://github.com/karpathy/llama2.c/issues/280. This commit simply passes vocab_size to get_tokenizer_model_path to avoid a silent crash when processing shards (in process_shard).
2023-08-13 17:49:10 +03:00
parent 8b472ded1f
commit 570789aa04
1 changed file with 1 addition and 1 deletion
@@ -120,7 +120,7 @@ def train_vocab(vocab_size):

 def process_shard(args, vocab_size):
    shard_id, shard = args
-    tokenizer_model = get_tokenizer_model_path()
+    tokenizer_model = get_tokenizer_model_path(vocab_size)
    enc = Tokenizer(tokenizer_model)
    with open(shard, "r") as f:
        data = json.load(f)