Fix sample.py, which was broken by the earlier tokenizer changes

2023-08-15 02:33:01 +00:00
parent a9a0628c92
commit fe2de68688
1 changed file with 7 additions and 2 deletions
@@ -51,11 +51,16 @@ if compile:
    print("Compiling the model...")
    model = torch.compile(model) # requires PyTorch 2.0 (optional)
-# load the tokenizer, either provided, or attempt to find it
+# load the tokenizer
+vocab_source = checkpoint_dict.get("vocab_source", "llama2")
+vocab_size = gptconf.vocab_size
 if tokenizer:
    # a specific tokenizer is provided, use it
    tokenizer_model = tokenizer
 else:
-    tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
+    # let's try to find the tokenizer model automatically. bit gross here...
+    query_vocab_size = 0 if vocab_source == "llama2" else vocab_size
+    tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size)
 enc = Tokenizer(tokenizer_model=tokenizer_model)
 # encode the beginning of the prompt