diff --git a/train.py b/train.py
index 39b4f49..24d6fa6 100644
--- a/train.py
+++ b/train.py
@@ -46,8 +46,8 @@ wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
 # data
 batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch size
 max_seq_len = 256
-vocab_source = "custom" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 512
+vocab_source = "llama2" # llama2|custom; use Llama 2 vocab from Meta, or custom trained
+vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
 # model
 dim = 288
 n_layers = 6