ok i can train and sample a model with a custom tokenizer
This commit is contained in:
@@ -11,12 +11,13 @@ from torch import nn
|
||||
|
||||
@dataclass
|
||||
class ModelArgs:
|
||||
# default hyperparameters for the Llama 7B model
|
||||
dim: int = 4096
|
||||
n_layers: int = 32
|
||||
n_heads: int = 32
|
||||
n_kv_heads: Optional[int] = None
|
||||
vocab_size: int = -1 # defined later by tokenizer
|
||||
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
|
||||
vocab_size: int = 32000
|
||||
multiple_of: int = 256 # MLP hidden layer size will be multiple of
|
||||
norm_eps: float = 1e-5
|
||||
max_seq_len: int = 2048
|
||||
dropout: float = 0.0
|
||||
|
||||
Reference in New Issue
Block a user