mark ModelArgs.hidden_dim as optional and calculate as previously if not provided
This commit is contained in:
@@ -17,7 +17,7 @@ class ModelArgs:
|
|||||||
n_heads: int = 32
|
n_heads: int = 32
|
||||||
n_kv_heads: Optional[int] = None
|
n_kv_heads: Optional[int] = None
|
||||||
vocab_size: int = 32000
|
vocab_size: int = 32000
|
||||||
hidden_dim: int = (4 * 4096)
|
hidden_dim: Optional[int] = None
|
||||||
multiple_of: int = 256 # MLP hidden layer size will be multiple of
|
multiple_of: int = 256 # MLP hidden layer size will be multiple of
|
||||||
norm_eps: float = 1e-5
|
norm_eps: float = 1e-5
|
||||||
max_seq_len: int = 2048
|
max_seq_len: int = 2048
|
||||||
@@ -167,6 +167,8 @@ class Attention(nn.Module):
|
|||||||
class FeedForward(nn.Module):
|
class FeedForward(nn.Module):
|
||||||
def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
|
def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
if hidden_dim is None:
|
||||||
|
hidden_dim = 4 * dim
|
||||||
hidden_dim = int(2 * hidden_dim / 3)
|
hidden_dim = int(2 * hidden_dim / 3)
|
||||||
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
||||||
self.w1 = nn.Linear(dim, hidden_dim, bias=False)
|
self.w1 = nn.Linear(dim, hidden_dim, bias=False)
|
||||||
|
|||||||
Reference in New Issue
Block a user