ok i can train and sample a model with a custom tokenizer

This commit is contained in:
Andrej Karpathy
2023-08-11 16:47:29 +00:00
parent 4c6f0af9ff
commit b0cfa2458d
4 changed files with 48 additions and 14 deletions
+5 -1
View File
@@ -9,6 +9,8 @@ import tiktoken
from model import ModelArgs, Transformer
from tokenizer import Tokenizer
from tinystories import get_tokenizer_model_path
# -----------------------------------------------------------------------------
out_dir = 'out' # ignored if init_from is not 'resume'
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
@@ -51,7 +53,9 @@ if compile:
model = torch.compile(model) # requires PyTorch 2.0 (optional)
# load the tokenizer
enc = Tokenizer()
assert checkpoint["config"]["dataset"] == "tinystories" # TODO: generalize
tokenizer_model = get_tokenizer_model_path(vocab_size=gptconf.vocab_size)
enc = Tokenizer(tokenizer_model=tokenizer_model)
# encode the beginning of the prompt
if start.startswith('FILE:'):