From 9c3cfb46a32cc529792f8ae08217035d997c1b3b Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sun, 13 Aug 2023 03:08:07 +0000
Subject: [PATCH] make default be the llama2 tokenizer

---
 train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 39b4f49..24d6fa6 100644
--- a/train.py
+++ b/train.py
@@ -46,8 +46,8 @@ wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
 # data
 batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
-vocab_source = "custom" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 512
+vocab_source = "llama2" # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
 # model
 dim = 288
 n_layers = 6