From 01df3731d6747659ad4d8cf7d9f4bcb27eb6d5f0 Mon Sep 17 00:00:00 2001 From: YiMing Han Date: Fri, 18 Aug 2023 15:09:24 -0400 Subject: [PATCH] only dart --- .github/workflows/build.yml | 193 ------------------ configurator.py | 47 ----- export_meta_llama_bin.py | 112 ----------- export_meta_llama_hf_bin.py | 113 ----------- model.py | 392 ------------------------------------ requirements.txt | 7 - run.ipynb | 130 ------------ sample.py | 79 -------- save_torchscript.py | 66 ------ tinystories.py | 274 ------------------------- tokenizer.py | 78 ------- train.py | 342 ------------------------------- train_vocab.sh | 126 ------------ 13 files changed, 1959 deletions(-) delete mode 100644 .github/workflows/build.yml delete mode 100644 configurator.py delete mode 100644 export_meta_llama_bin.py delete mode 100644 export_meta_llama_hf_bin.py delete mode 100644 model.py delete mode 100644 requirements.txt delete mode 100644 run.ipynb delete mode 100644 sample.py delete mode 100755 save_torchscript.py delete mode 100644 tinystories.py delete mode 100644 tokenizer.py delete mode 100644 train.py delete mode 100755 train_vocab.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 7e6474d..0000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,193 +0,0 @@ -name: Continuous Integration - -on: - push: - branches: - - master - paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h', '**/*.py'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['**/Makefile', '**/*.c', '**/*.h', '**/*.py'] - # for manual triggering - workflow_dispatch: - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - -jobs: - # check basic builds to avoid breaking changes - ubuntu-focal-make: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential -y - - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - - name: Pip setup - run: | - python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - - name: Build - id: make_build - run: | - make - - - name: Build runfast - id: make_build_runfast - run: | - make runfast - - - name: Test with pytest - run: | - pytest - - macOS-latest-make: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - - name: Pip setup - run: | - python -m pip install --upgrade pip - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - - name: Build clang - id: make_build_clang - run: | - make run CC=clang - - - name: Build - id: make_build - run: | - make - - - name: Build runfast - id: make_build_runfast - run: | - make runfast - - - name: Test with pytest - run: pytest - - - - - windows-latest-make: - runs-on: windows-latest - - strategy: - fail-fast: false #necessary, otherwise the matrix breaks - matrix: - arch: - - amd64 - - amd64_x86 - - amd64_arm64 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Setup MSBuild - uses: microsoft/setup-msbuild@v1 - - - name: Setup MSVC ${{ matrix.arch }} - uses: ilammy/msvc-dev-cmd@v1 - with: - arch: ${{ matrix.arch }} - - - name: Set up Python 3.10 - if: matrix.arch != 'amd64_arm64' - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - - name: Pip setup - if: matrix.arch != 'amd64_arm64' - run: | - python -m pip install --upgrade pip - if (Test-Path requirements.txt) { - pip install -r requirements.txt - } - - - name: Build ${{ matrix.arch }} - id: build_msvc - run: | - .\build_msvc.bat - - #cross-comiled, cannot be run on host - - name: Test with pytest - if: matrix.arch != 'amd64_arm64' - run: pytest - - windows-latest-mingw: - runs-on: windows-latest - - defaults: - run: - shell: msys2 {0} - - strategy: - matrix: - include: - - { sys: mingw64, env: x86_64 } - - steps: - - name: Checkout - id: checkout - uses: actions/checkout@v3 - - - uses: msys2/setup-msys2@v2 - id: setup-msys2 - with: - msystem: ${{ matrix.sys }} - install: mingw-w64-${{matrix.env}}-gcc make - - - name: Build ${{ matrix.sys }} ${{ matrix.env }} - id: build_mingw - run: | - make win64 - - - name: Set up Python 3.10 - uses: actions/setup-python@v3 - with: - python-version: "3.10" - - - name: Pip setup - shell: powershell - run: | - python -m pip install --upgrade pip - if (Test-Path requirements.txt) { - pip install -r requirements.txt - } - - - name: Test with pytest - shell: powershell - run: pytest diff --git a/configurator.py b/configurator.py deleted file mode 100644 index a8bba95..0000000 --- a/configurator.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Poor Man's Configurator. Probably a terrible idea. Example usage: -$ python train.py config/override_file.py --batch_size=32 -this will first run config/override_file.py, then override batch_size to 32 - -The code in this file will be run as follows from e.g. train.py: ->>> exec(open('configurator.py').read()) - -So it's not a Python module, it's just shuttling this code away from train.py -The code in this script then overrides the globals() - -I know people are not going to love this, I just really dislike configuration -complexity and having to prepend config. to every single variable. If someone -comes up with a better simple Python solution I am all ears. -""" - -import sys -from ast import literal_eval - -for arg in sys.argv[1:]: - if '=' not in arg: - # assume it's the name of a config file - assert not arg.startswith('--') - config_file = arg - print(f"Overriding config with {config_file}:") - with open(config_file) as f: - print(f.read()) - exec(open(config_file).read()) - else: - # assume it's a --key=value argument - assert arg.startswith('--') - key, val = arg.split('=') - key = key[2:] - if key in globals(): - try: - # attempt to eval it it (e.g. if bool, number, or etc) - attempt = literal_eval(val) - except (SyntaxError, ValueError): - # if that goes wrong, just use the string - attempt = val - # ensure the types match ok - assert type(attempt) == type(globals()[key]) - # cross fingers - print(f"Overriding: {key} = {attempt}") - globals()[key] = attempt - else: - raise ValueError(f"Unknown config key: {key}") diff --git a/export_meta_llama_bin.py b/export_meta_llama_bin.py deleted file mode 100644 index 4e42197..0000000 --- a/export_meta_llama_bin.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -This script exports the Llama 2 weights in llama2c.bin format. -""" -import os -import sys -import struct -from pathlib import Path -import json - -import torch - -from model import precompute_freqs_cis - - -def export(p, state_dict, filepath='model.bin'): - """export the model weights in fp32 into .bin file to be read from C""" - f = open(filepath, 'wb') - - def serialize(key): - print(f"writing {key}...") - t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy() - f.write(memoryview(t)) - del state_dict[key] - - # first write out the header - hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0] - p['vocab_size'] = 32000 - p['max_seq_len'] = 2048 - - n_kv_heads = p.get('n_kv_heads') or p['n_heads'] - header = struct.pack( - 'iiiiiii', - p['dim'], hidden_dim, p['n_layers'], p['n_heads'], - n_kv_heads, -p['vocab_size'], p['max_seq_len'] - ) - # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present - # in the checkpoint and should be loaded. - f.write(header) - - # next write out the embedding weights - print("writing tok_embeddings...") - serialize('tok_embeddings.weight') - - # now all the layers - # attention weights - for i in range(p['n_layers']): serialize(f'layers.{i}.attention_norm.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wq.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wk.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wv.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wo.weight') - # ffn weights - for i in range(p['n_layers']): serialize(f'layers.{i}.ffn_norm.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w1.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w2.weight') - for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w3.weight') - - # final rmsnorm - serialize('norm.weight') - # freqs_cos, freqs_sin - freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2) - state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']] - state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']] - serialize('freqs_cos') - serialize('freqs_sin') - - # finally write the output weights - serialize('output.weight') - - f.close() - print(f"wrote {filepath}") - - -def concat_weights(models): - state_dict = {} - for name in list(models[0]): - tensors = [model[name] for model in models] - if len(tensors) == 1 or len(tensors[0].shape) == 1: - state_dict[name] = tensors[0] - continue - is_axis_1 = ( - name.startswith('tok_embeddings.') - or name.endswith('.attention.wo.weight') - or name.endswith('.feed_forward.w2.weight') - ) - axis = 1 if is_axis_1 else 0 - state_dict[name] = torch.cat(tensors, dim=axis) - for model in models: - del model[name] - return state_dict - - -def load_and_export(model_path, output_path): - params_path = os.path.join(model_path, 'params.json') - with open(params_path) as f: - params = json.load(f) - print(params) - - model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth'))) - models = [torch.load(p, map_location='cpu') for p in model_paths] - state_dict = concat_weights(models) - del models - export(params, state_dict, output_path) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print('[Llama model folder path] [output path]') - exit() - - model_path = sys.argv[1] - output_path = sys.argv[2] - load_and_export(model_path, output_path) diff --git a/export_meta_llama_hf_bin.py b/export_meta_llama_hf_bin.py deleted file mode 100644 index e3a8c73..0000000 --- a/export_meta_llama_hf_bin.py +++ /dev/null @@ -1,113 +0,0 @@ -""" -This script exports the Llama 2 weights in llama2c.bin format. -""" -import os -import sys -import struct -from pathlib import Path -import json - -import torch - -from model import precompute_freqs_cis - - -def export(p, state_dict, filepath='model.bin'): - """export the model weights in fp32 into .bin file to be read from C""" - f = open(filepath, 'wb') - - def serialize(key): - print(f"writing {key}...") - t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy() - f.write(memoryview(t)) - del state_dict[key] - - # first write out the header - hidden_dim = state_dict['model.layers.0.mlp.gate_proj.weight'].shape[0] - p['vocab_size'] = 32000 - p['max_seq_len'] = 2048 - - n_kv_heads = p.get('n_kv_heads') or p['n_heads'] - header = struct.pack( - 'iiiiiii', - p['dim'], hidden_dim, p['n_layers'], p['n_heads'], - n_kv_heads, -p['vocab_size'], p['max_seq_len'] - ) - # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present - # in the checkpoint and should be loaded. - f.write(header) - - # next write out the embedding weights - print("writing tok_embeddings...") - serialize('model.embed_tokens.weight') - - # now all the layers - # attention weights - for i in range(p['n_layers']): serialize(f'model.layers.{i}.input_layernorm.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.q_proj.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.k_proj.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.v_proj.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.o_proj.weight') - # ffn weights - for i in range(p['n_layers']): serialize(f'model.layers.{i}.post_attention_layernorm.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.gate_proj.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.down_proj.weight') - for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.up_proj.weight') - - # final rmsnorm - serialize('model.norm.weight') - # freqs_cos, freqs_sin - freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2) - state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']] - state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']] - # check if this requires addtional conversion - serialize('freqs_cos') - serialize('freqs_sin') - - # finally write the output weights - serialize('lm_head.weight') - - f.close() - print(f"wrote {filepath}") - - -def concat_weights(models): - state_dict = {} - for name in list(models[0]): - tensors = [model[name] for model in models] - if len(tensors) == 1 or len(tensors[0].shape) == 1: - state_dict[name] = tensors[0] - continue - is_axis_1 = ( - name.startswith('model.embed_tokens.weight') - or name.endswith('.self_attn.o_proj.weight') - or name.endswith('.mlp.down_proj.weight') - ) - axis = 1 if is_axis_1 else 0 - state_dict[name] = torch.cat(tensors, dim=axis) - for model in models: - del model[name] - return state_dict - - -def load_and_export(model_path, output_path): - params_path = os.path.join(model_path, 'params.json') - with open(params_path) as f: - params = json.load(f) - print(params) - - model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth'))) - models = [torch.load(p, map_location='cpu') for p in model_paths] - state_dict = concat_weights(models) - del models - export(params, state_dict, output_path) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print('[Llama model folder path] [output path]') - exit() - - model_path = sys.argv[1] - output_path = sys.argv[2] - load_and_export(model_path, output_path) diff --git a/model.py b/model.py deleted file mode 100644 index c8c82a9..0000000 --- a/model.py +++ /dev/null @@ -1,392 +0,0 @@ -import math -import struct -import inspect -from dataclasses import dataclass -from typing import Any, Optional, Tuple - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -@dataclass -class ModelArgs: - # default hyperparameters for the Llama 7B model - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = 32000 - multiple_of: int = 256 # MLP hidden layer size will be multiple of - norm_eps: float = 1e-5 - max_seq_len: int = 2048 - dropout: float = 0.0 - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - output = self._norm(x.float()).type_as(x) - return output * self.weight - - -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): - freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - t = torch.arange(end, device=freqs.device) # type: ignore - freqs = torch.outer(t, freqs).float() # type: ignore - freqs_cos = torch.cos(freqs) # real part - freqs_sin = torch.sin(freqs) # imaginary part - return freqs_cos, freqs_sin - -def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): - ndim = x.ndim - assert 0 <= 1 < ndim - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(shape) - -def apply_rotary_emb( - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor]: - - # reshape xq and xk to match the complex representation - xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) - xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) - - # reshape freqs_cos and freqs_sin for broadcasting - freqs_cos = reshape_for_broadcast(freqs_cos, xq_r) - freqs_sin = reshape_for_broadcast(freqs_sin, xq_r) - - # apply rotation using real numbers - xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin - xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos - xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin - xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos - - # flatten last two dimensions - xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) - xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) - - return xq_out.type_as(xq), xk_out.type_as(xk) - -def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: - """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" - bs, slen, n_kv_heads, head_dim = x.shape - if n_rep == 1: - return x - return ( - x[:, :, :, None, :] - .expand(bs, slen, n_kv_heads, n_rep, head_dim) - .reshape(bs, slen, n_kv_heads * n_rep, head_dim) - ) - -class Attention(nn.Module): - def __init__(self, args: ModelArgs): - super().__init__() - self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads - assert args.n_heads % self.n_kv_heads == 0 - model_parallel_size = 1 - self.n_local_heads = args.n_heads // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // model_parallel_size - self.n_rep = self.n_local_heads // self.n_local_kv_heads - self.head_dim = args.dim // args.n_heads - self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False) - self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False) - self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False) - self.attn_dropout = nn.Dropout(args.dropout) - self.resid_dropout = nn.Dropout(args.dropout) - self.dropout = args.dropout - - # use flash attention or a manual implementation? - self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') - if not self.flash: - print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0") - mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf")) - mask = torch.triu(mask, diagonal=1) - self.register_buffer("mask", mask) - - def forward( - self, - x: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor, - ): - bsz, seqlen, _ = x.shape - - # QKV - xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - - # RoPE relative positional embeddings - xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin) - - # grouped multiquery attention: expand out keys and values - xk = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - xv = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) - - # make heads into a batch dimension - xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim) - xk = xk.transpose(1, 2) - xv = xv.transpose(1, 2) - - # flash implementation - if self.flash: - output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True) - else: - # manual implementation - scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim) - assert hasattr(self, 'mask') - scores = scores + self.mask[:, :, :seqlen, :seqlen] # (bs, n_local_heads, seqlen, cache_len + seqlen) - scores = F.softmax(scores.float(), dim=-1).type_as(xq) - scores = self.attn_dropout(scores) - output = torch.matmul(scores, xv) # (bs, n_local_heads, seqlen, head_dim) - - # restore time as batch dimension and concat heads - output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1) - - # final projection into the residual stream - output = self.wo(output) - output = self.resid_dropout(output) - return output - - -class FeedForward(nn.Module): - def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float): - super().__init__() - hidden_dim = int(2 * hidden_dim / 3) - hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - self.w1 = nn.Linear(dim, hidden_dim, bias=False) - self.w2 = nn.Linear(hidden_dim, dim, bias=False) - self.w3 = nn.Linear(dim, hidden_dim, bias=False) - self.dropout = nn.Dropout(dropout) - - def forward(self, x): - return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x))) - - -class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): - super().__init__() - self.n_heads = args.n_heads - self.dim = args.dim - self.head_dim = args.dim // args.n_heads - self.attention = Attention(args) - self.feed_forward = FeedForward( - dim=args.dim, - hidden_dim=4 * args.dim, - multiple_of=args.multiple_of, - dropout=args.dropout, - ) - self.layer_id = layer_id - self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) - self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) - - def forward(self, x, freqs_cos, freqs_sin): - h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin) - out = h + self.feed_forward.forward(self.ffn_norm(h)) - return out - - -class Transformer(nn.Module): - last_loss: Optional[torch.Tensor] - - def __init__(self, params: ModelArgs): - super().__init__() - self.params = params - self.vocab_size = params.vocab_size - self.n_layers = params.n_layers - - self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) - self.dropout = nn.Dropout(params.dropout) - self.layers = torch.nn.ModuleList() - for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) - self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = nn.Linear(params.dim, params.vocab_size, bias=False) - - # share the unembedding parameters with the embedding parameters - self.tok_embeddings.weight = self.output.weight # https://paperswithcode.com/method/weight-tying - - # some useful precompute for the RoPE relative positional embeddings - freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len) - self.register_buffer("freqs_cos", freqs_cos, persistent=False) - self.register_buffer("freqs_sin", freqs_sin, persistent=False) - - # init all weights - self.apply(self._init_weights) - # apply special scaled init to the residual projections, per GPT-2 paper - for pn, p in self.named_parameters(): - if pn.endswith('w3.weight') or pn.endswith('wo.weight'): - torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers)) - - # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor. - self.last_loss = None - - def _init_weights(self, module): - if isinstance(module, nn.Linear): - torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) - if module.bias is not None: - torch.nn.init.zeros_(module.bias) - elif isinstance(module, nn.Embedding): - torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) - - def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor: - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens) - h = self.dropout(h) - freqs_cos = self.freqs_cos[:seqlen] - freqs_sin = self.freqs_sin[:seqlen] - - for layer in self.layers: - h = layer(h, freqs_cos, freqs_sin) - h = self.norm(h) - - if targets is not None: - # if we are given some desired targets also calculate the loss - logits = self.output(h) - self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) - else: - # inference-time mini-optimization: only forward the output on the very last position - logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim - self.last_loss = None - - return logits - - def configure_optimizers(self, weight_decay, learning_rate, betas, device_type): - # start with all of the candidate parameters - param_dict = {pn: p for pn, p in self.named_parameters()} - # filter out those that do not require grad - param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} - # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no. - # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't. - decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] - nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] - optim_groups = [ - {'params': decay_params, 'weight_decay': weight_decay}, - {'params': nodecay_params, 'weight_decay': 0.0} - ] - num_decay_params = sum(p.numel() for p in decay_params) - num_nodecay_params = sum(p.numel() for p in nodecay_params) - print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters") - print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters") - # Create AdamW optimizer and use the fused version if it is available - fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters - use_fused = fused_available and device_type == 'cuda' - extra_args = dict(fused=True) if use_fused else dict() - optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args) - print(f"using fused AdamW: {use_fused}") - - return optimizer - - def estimate_mfu(self, fwdbwd_per_iter, dt): - """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """ - # first estimate the number of flops we do per iteration. - # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311 - N = sum(p.numel() for p in self.parameters()) - cfg = self.params - L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim//cfg.n_heads, cfg.max_seq_len - flops_per_token = 6*N + 12*L*H*Q*T - flops_per_fwdbwd = flops_per_token * T - flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter - # express our flops throughput as ratio of A100 bfloat16 peak flops - flops_achieved = flops_per_iter * (1.0/dt) # per second - flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS - mfu = flops_achieved / flops_promised - return mfu - - @torch.inference_mode() - def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): - """ - Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete - the sequence max_new_tokens times, feeding the predictions back into the model each time. - Most likely you'll want to make sure to be in model.eval() mode of operation for this. - Also note this is a super inefficient version of sampling with no key/value cache. - """ - for _ in range(max_new_tokens): - # if the sequence context is growing too long we must crop it at block_size - idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:] - # forward the model to get the logits for the index in the sequence - logits = self(idx_cond) - logits = logits[:, -1, :] # crop to just the final time step - if temperature == 0.0: - # "sample" the single most likely index - _, idx_next = torch.topk(logits, k=1, dim=-1) - else: - # pluck the logits at the final step and scale by desired temperature - logits = logits / temperature - # optionally crop the logits to only the top k options - if top_k is not None: - v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - logits[logits < v[:, [-1]]] = -float('Inf') - # apply softmax to convert logits to (normalized) probabilities - probs = F.softmax(logits, dim=-1) - idx_next = torch.multinomial(probs, num_samples=1) - # append sampled index to the running sequence and continue - idx = torch.cat((idx, idx_next), dim=1) - - return idx - - def export(self, filepath='model.bin'): - """export the model weights in fp32 into .bin file to be read from C""" - f = open(filepath, 'wb') - - def serialize(t): - d = t.detach().cpu().view(-1).numpy().astype(np.float32) - b = struct.pack(f'{len(d)}f', *d) - f.write(b) - - # first write out the header - hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0] - p = self.params - n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads - header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads, - n_kv_heads, p.vocab_size, p.max_seq_len) - f.write(header) - - # next write out the embedding weights - serialize(self.tok_embeddings.weight) - - # now all the layers - # attention weights - for layer in self.layers: - serialize(layer.attention_norm.weight) - for layer in self.layers: - serialize(layer.attention.wq.weight) - for layer in self.layers: - serialize(layer.attention.wk.weight) - for layer in self.layers: - serialize(layer.attention.wv.weight) - for layer in self.layers: - serialize(layer.attention.wo.weight) - # ffn weights - for layer in self.layers: - serialize(layer.ffn_norm.weight) - for layer in self.layers: - serialize(layer.feed_forward.w1.weight) - for layer in self.layers: - serialize(layer.feed_forward.w2.weight) - for layer in self.layers: - serialize(layer.feed_forward.w3.weight) - # final rmsnorm - serialize(self.norm.weight) - # note: no need to write final classifier weights due to weight sharing - # freqs_cis - serialize(self.freqs_cos[:p.max_seq_len]) - serialize(self.freqs_sin[:p.max_seq_len]) - - # write to binary file - f.close() - print(f"wrote {filepath}") diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7187a73..0000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -numpy==1.23.5 -pytest==7.4.0 -Requests==2.31.0 -sentencepiece==0.1.99 -torch==2.0.1 -tqdm==4.64.1 -wandb==0.15.5 diff --git a/run.ipynb b/run.ipynb deleted file mode 100644 index ac57593..0000000 --- a/run.ipynb +++ /dev/null @@ -1,130 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "HLdoj4cz-xal" - }, - "source": [ - "# Run.c\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/karpathy/llama2.c/blob/master/run.ipynb)\n", - "\n", - "More details can be found in the [README.md](README.md) ." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Une3Ozlnu1B7" - }, - "outputs": [], - "source": [ - "#@title Clone Project\n", - "\n", - "!git clone https://github.com/karpathy/llama2.c.git\n", - "%cd llama2.c" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#@title Build\n", - "\n", - "!make runfast" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "thm0ZBrtSgoC" - }, - "outputs": [], - "source": [ - "#@title Pick Your Model\n", - "\n", - "#@markdown Choose model\n", - "model = \"stories15M\" #@param [\"stories15M\", \"stories42M\", \"stories110M\"]\n", - "\n", - "download_url = \"\"\n", - "\n", - "if(model == \"stories15M\"):\n", - " download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin\"\n", - "if(model == \"stories42M\"):\n", - " download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin\"\n", - "if(model == \"stories110M\"):\n", - " download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin\"\n", - "\n", - "print(f\"download_url: {download_url}\")\n", - "\n", - "!wget $download_url\n", - "\n", - "model_file = model + \".bin\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "OgAc3KjuT-NM" - }, - "outputs": [], - "source": [ - "#@title Generate Stories\n", - "\n", - "# Generate args\n", - "max_token = 256 #@param {type:\"slider\", min:32, max:1024, step:32}\n", - "temperature = 0.8 #@param {type:\"slider\", min:0.0, max:1, step:0.05}\n", - "top_p = 0.9 #@param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n", - "prompt = \"One day, Lily met a Shoggoth\" #@param {type:\"string\"}\n", - "\n", - "print(f\"model: {model_file}, max_token: {max_token}, temperature: {temperature}, top_p: {top_p}, prompt: {prompt}\")\n", - "print(f\"----------------------------\\n\")\n", - "\n", - "cmd = f'./run {model_file} -t {temperature} -p {top_p} -n {max_token} -i \"{prompt}\"'\n", - "!{cmd}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#@title Run Meta's Llama 2 models\n", - "\n", - "#@markdown input your huggingface [access token](https://huggingface.co/settings/tokens) to download Meta's Llama 2 models.\n", - "\n", - "from huggingface_hub import snapshot_download\n", - "\n", - "token = \"replace your huggingface access token\" #@param {type:\"string\"}\n", - "path = snapshot_download(repo_id=\"meta-llama/Llama-2-7b\",cache_dir=\"Llama-2-7b\", use_auth_token=token)\n", - "\n", - "!python export_meta_llama_bin.py $path llama2_7b.bin\n", - "\n", - "print(\"./run llama2_7b.bin\\n\")\n", - "!./run llama2_7b.bin" - ] - } - ], - "metadata": { - "colab": { - "private_outputs": true, - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/sample.py b/sample.py deleted file mode 100644 index d2f56ea..0000000 --- a/sample.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Sample from the trained model with PyTorch -""" -import os -import pickle -from contextlib import nullcontext -import torch -from model import ModelArgs, Transformer -from tokenizer import Tokenizer - -from tinystories import get_tokenizer_model_path - -# ----------------------------------------------------------------------------- -checkpoint = 'out/ckpt.pt' -start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt" -num_samples = 1 # number of samples to draw -max_new_tokens = 100 # number of tokens generated in each sample -temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions -top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability -tokenizer = "" # override the tokenizer model path -seed = 1337 -device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. -#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16' -dtype = "float32" -compile = False # use PyTorch 2.0 to compile the model to be faster -exec(open('configurator.py').read()) # overrides from command line or config file -# ----------------------------------------------------------------------------- - -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul -torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast -ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] -ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) - -# init from a model saved in a specific directory -checkpoint_dict = torch.load(checkpoint, map_location=device) -gptconf = ModelArgs(**checkpoint_dict['model_args']) -model = Transformer(gptconf) -state_dict = checkpoint_dict['model'] -unwanted_prefix = '_orig_mod.' -for k,v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) -model.load_state_dict(state_dict, strict=False) - -model.eval() -model.to(device) -if compile: - print("Compiling the model...") - model = torch.compile(model) # requires PyTorch 2.0 (optional) - -# load the tokenizer -vocab_source = checkpoint_dict.get("vocab_source", "llama2") -vocab_size = gptconf.vocab_size -if tokenizer: - # a specific tokenizer is provided, use it - tokenizer_model = tokenizer -else: - # let's try to find the tokenizer model automatically. bit gross here... - query_vocab_size = 0 if vocab_source == "llama2" else vocab_size - tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size) -enc = Tokenizer(tokenizer_model=tokenizer_model) - -# encode the beginning of the prompt -if start.startswith('FILE:'): - with open(start[5:], 'r', encoding='utf-8') as f: - start = f.read() -start_ids = enc.encode(start, bos=True, eos=False) -x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]) - -# run generation -with torch.no_grad(): - with ctx: - for k in range(num_samples): - y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k) - print(enc.decode(y[0].tolist())) - print('---------------') diff --git a/save_torchscript.py b/save_torchscript.py deleted file mode 100755 index af3a299..0000000 --- a/save_torchscript.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python -"""Saves the model as a TorchScript. - -Usage examples: - ./save_torchscript.py - ./save_torchscript.py --dim=300 - ./save_torchscript.py --gzip_output=True --zero_params=True - -The resulting file can be loaded in C++ code and then used for training or -inference with: - #include - torch::jit::Module module = torch::jit::load("model.pt") - -Note that the serialized model includes the initial parameters and with the default -ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute -the model parameters separately you can zero out the parameters before saving it and -it will gzip down to 780K. -""" -import gzip -import os -import shutil -from inspect import signature - -import torch - -from model import ModelArgs, Transformer - -# Model args config -dim = 288 -n_layers = 6 -n_heads = 6 -n_kv_heads = n_heads -multiple_of = 32 -max_seq_len = 256 -dropout = 0.0 -vocab_size = 32000 -norm_eps = 1e-5 -# Save config -model_path = "model.pt" -zero_params = False -gzip_output = False -# Allow config overrides -exec(open("configurator.py").read()) - - -def main() -> None: - model_args = {k: globals()[k] for k in signature(ModelArgs).parameters} - model = Transformer(ModelArgs(**model_args)) - - # If requested zero params before saving the model. This is useful in - # conjunction with gzip_output. - if zero_params: - for p in model.parameters(): - p.detach().zero_() - - torch.jit.save(torch.jit.script(model), model_path) - - if gzip_output: - with open(model_path, "rb") as f_in: - with gzip.open(f"{model_path}.gz", "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - os.unlink(model_path) - - -if __name__ == "__main__": - main() diff --git a/tinystories.py b/tinystories.py deleted file mode 100644 index 690cb02..0000000 --- a/tinystories.py +++ /dev/null @@ -1,274 +0,0 @@ -""" -Download, preprocess and serve the TinyStories dataset as a DataLoader. -""" - -import argparse -import glob -import json -import os -import random -from typing import List -from concurrent.futures import ProcessPoolExecutor -from functools import partial - -import numpy as np -import requests -import torch -import torch.distributed as dist -from tqdm import tqdm - -from tokenizer import Tokenizer - -DATA_CACHE_DIR = "data" - -def download_file(url: str, fname: str, chunk_size=1024): - """Helper function to download a file from a given url""" - resp = requests.get(url, stream=True) - total = int(resp.headers.get("content-length", 0)) - with open(fname, "wb") as file, tqdm( - desc=fname, - total=total, - unit="iB", - unit_scale=True, - unit_divisor=1024, - ) as bar: - for data in resp.iter_content(chunk_size=chunk_size): - size = file.write(data) - bar.update(size) - - -def download(): - """Downloads the TinyStories dataset to DATA_CACHE_DIR""" - os.makedirs(DATA_CACHE_DIR, exist_ok=True) - - # download the TinyStories dataset, unless it's already downloaded - data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz" - data_filename = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz") - if not os.path.exists(data_filename): - print(f"Downloading {data_url} to {data_filename}...") - download_file(data_url, data_filename) - else: - print(f"{data_filename} already exists, skipping download...") - - # unpack the tar.gz file into all the data shards (json files) - data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") - if not os.path.exists(data_dir): - os.makedirs(data_dir, exist_ok=True) - print(f"Unpacking {data_filename}...") - os.system(f"tar -xzf {data_filename} -C {data_dir}") - else: - print(f"{data_dir} already exists, skipping unpacking...") - - # print a single example just for debugging and such - shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) - with open(shard_filenames[0], "r") as f: - data = json.load(f) - print("Download done.") - print(f"Number of shards: {len(shard_filenames)}") - print(f"Example story:\n{data[0]}") - -def train_vocab(vocab_size): - """ - Trains a custom sentencepiece tokenizer on the TinyStories dataset. - The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories, - where N is the vocab size. This is also where the pretok .bin files will go. - """ - assert vocab_size > 0, "Vocab size must be positive" - - # output file prefix path for sentencepiece - prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") - - # how many shards we'll use for vocab training, kept low for efficiency - num_shards = 10 - - # 1) export a large chunk of text as a single text file tiny.txt - tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt") - data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") - shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) - - print(f"Writing temporary file {tiny_file} with {num_shards} shards...") - with open(tiny_file, "w") as of: - for shard in tqdm(shard_filenames[:num_shards]): - with open(shard, "r") as f: - data = json.load(f) - for example in data: - text = example["story"] - text = text.strip() - of.write(text + "\n") - print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB") - - # 2) run the train_vocab.sh script that trains the sentencepiece model - print("Will now train the vocab with:") - cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}" - print(cmd) - print("OK? [y/N] ") - dec = input() - if dec.lower() != "y": - print("Exiting...") - return - os.system(cmd) - - # 3) optional cleanup, ask the user if they'd like to delete tiny.txt - dec = input(f"Delete the temporary file {tiny_file}? [y/N] ") - if dec.lower() == "y": - os.remove(tiny_file) - print(f"Deleted {tiny_file}") - - print(f"Trained tokenizer is in {prefix}.model") - print("Done.") - - -def process_shard(args, vocab_size): - shard_id, shard = args - tokenizer_model = get_tokenizer_model_path(vocab_size) - enc = Tokenizer(tokenizer_model) - with open(shard, "r") as f: - data = json.load(f) - all_tokens = [] - for example in tqdm(data, position=shard_id): - text = example["story"] - text = text.strip() # get rid of leading/trailing whitespace - tokens = enc.encode(text, bos=True, eos=False) # encode the text, use BOS - all_tokens.extend(tokens) - # convert to uint16 nparray - all_tokens = np.array(all_tokens, dtype=np.uint16) - # calculate the output filename - if vocab_size == 0: - # if we're using Llama 2, just save the tokenized file in the same dir - tokenized_filename = shard.replace(".json", ".bin") - else: - # save .bin files into a new tok{N} directory - bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") - shard_basename = os.path.basename(shard) - bin_basename = shard_basename.replace(".json", ".bin") - tokenized_filename = os.path.join(bin_dir, bin_basename) - # write the bytes - with open(tokenized_filename, "wb") as f: - f.write(all_tokens.tobytes()) - # calculate the average sequence length (they are separated by BOS=1) - avg_seq_len = all_tokens.size / ((all_tokens == 1).sum()) - print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}") - - -def pretokenize(vocab_size): - # iterate the shards and tokenize all of them one by one - data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") - shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json"))) - if vocab_size > 0: - # .bin files will be saved into tok{N} directory, create it once here - bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}") - os.makedirs(bin_dir, exist_ok=True) - - # process all the shards in a process pool - fun = partial(process_shard, vocab_size=vocab_size) - with ProcessPoolExecutor() as executor: - executor.map(fun, enumerate(shard_filenames)) - print("Done.") - - -class PretokDataset(torch.utils.data.IterableDataset): - """Loads pretokenized examples from disk and yields them as PyTorch tensors.""" - - def __init__(self, split, max_seq_len, vocab_size, vocab_source): - super().__init__() - self.split = split - self.max_seq_len = max_seq_len - self.vocab_size = vocab_size - self.vocab_source = vocab_source - - def __iter__(self): - # get worker info within a DataLoader - worker_info = torch.utils.data.get_worker_info() - worker_id = worker_info.id if worker_info else 0 - # get DDP rank info - rank = dist.get_rank() if dist.is_initialized() else 0 - # combine the worker_id and worker_rank to create a unique seed for rng - seed = 42 + worker_id + 1337 * rank - rng = random.Random(seed) - print(f"Created a PretokDataset with rng seed {seed}") - if self.vocab_source == "llama2": - # the .bin files are right along the .json files - bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data") - shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin"))) - elif self.vocab_source == "custom": - # the .bin files are in tok{N} directory - bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}") - shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin"))) - # train/test split. let's use only shard 0 for test split, rest train - shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1] - while True: - rng.shuffle(shard_filenames) - for shard in shard_filenames: - # open the dataset for reading but keep it on disk with memmap - m = np.memmap(shard, dtype=np.uint16, mode="r") - num_batches = len(m) // self.max_seq_len - num_batches -= 1 # drop the last partial batch - assert num_batches > 0, "this shard is way too small? investigate." - ixs = list(range(num_batches)) - rng.shuffle(ixs) - for ix in ixs: - start = ix * self.max_seq_len - end = start + self.max_seq_len + 1 - # calling .astype will copy the data into a new numpy array, now in RAM - chunk = torch.from_numpy((m[start:end]).astype(np.int64)) - x = chunk[:-1] - y = chunk[1:] - yield x, y - -# ----------------------------------------------------------------------------- -# public interface functions - -def get_tokenizer_model_path(vocab_size): - """ - Returns path to the sentencepiece tokenizer model for a given vocab size - vocab_size = 0 designates the default Llama 2 tokenizer, in that case - None is returned. - """ - if vocab_size == 0: - return None - else: - return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model") - -class Task: - - @staticmethod - def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs): - ds = PretokDataset(**dataset_kwargs) - dl = torch.utils.data.DataLoader( - ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers - ) - for x, y in dl: - x = x.to(device, non_blocking=True) - y = y.to(device, non_blocking=True) - yield x, y - -# ----------------------------------------------------------------------------- -# CLI for constructing the dataset - -if __name__ == "__main__": - """ - These stages are designed to be run in order. - - To tokenize data with the Llama 2 tokenizer: - python tinystories.py download - python tinystories.py pretokenize - - To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.: - python tinystories.py download - python tinystories.py train_vocab --vocab_size=2048 - python tinystories.py pretokenize --vocab_size=2048 - """ - parser = argparse.ArgumentParser() - parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"]) - parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.") - args = parser.parse_args() - - # depending on the stage call the appropriate function - if args.stage == "download": - download() - elif args.stage == "train_vocab": - train_vocab(vocab_size=args.vocab_size) - elif args.stage == "pretokenize": - pretokenize(vocab_size=args.vocab_size) - else: - raise ValueError(f"Unknown stage {args.stage}") diff --git a/tokenizer.py b/tokenizer.py deleted file mode 100644 index f3c0cc3..0000000 --- a/tokenizer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Taken from llama code and lightly modified -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. - -import os -import struct -import argparse -from typing import List - -from sentencepiece import SentencePieceProcessor - -TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model - -class Tokenizer: - def __init__(self, tokenizer_model=None): - model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - self.model_path = model_path - - # BOS / EOS token IDs - self.n_words: int = self.sp_model.vocab_size() - self.bos_id: int = self.sp_model.bos_id() - self.eos_id: int = self.sp_model.eos_id() - self.pad_id: int = self.sp_model.pad_id() - #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}") - assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() - - def encode(self, s: str, bos: bool, eos: bool) -> List[int]: - assert type(s) is str - t = self.sp_model.encode(s) - if bos: - t = [self.bos_id] + t - if eos: - t = t + [self.eos_id] - return t - - def decode(self, t: List[int]) -> str: - return self.sp_model.decode(t) - - def export(self): - - # get all the tokens (postprocessed) and their scores as floats - tokens, scores = [], [] - for i in range(self.n_words): - - # decode the token and light postprocessing - t = self.sp_model.id_to_piece(i) - s = self.sp_model.get_score(i) - if i == self.bos_id: - t = '\n\n' - elif i == self.eos_id: - t = '\n\n' - t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace - b = t.encode('utf-8') # bytes of this token, utf-8 encoded - - tokens.append(b) - scores.append(s) - - # record the max token length - max_token_length = max(len(t) for t in tokens) - - # write to a binary file - # the tokenizer.bin file is the same as .model file, but .bin - tokenizer_bin = self.model_path.replace('.model', '.bin') - with open(tokenizer_bin, 'wb') as f: - f.write(struct.pack("I", max_token_length)) - for bytes, score in zip(tokens, scores): - f.write(struct.pack("fI", score, len(bytes))) - f.write(bytes) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ") - args = parser.parse_args() - - t = Tokenizer(args.tokenizer_model) - t.export() diff --git a/train.py b/train.py deleted file mode 100644 index b1972dc..0000000 --- a/train.py +++ /dev/null @@ -1,342 +0,0 @@ -""" -This training script can be run both on a single gpu in debug mode, -and also in a larger training run with distributed data parallel (ddp). - -To run on a single GPU small debug run, example: -$ python -m train.py --compile=False --eval_iters=10 --batch_size=8 - -To run with DDP on 4 gpus on 1 node, example: -$ torchrun --standalone --nproc_per_node=4 train.py - -To run with DDP on 4 gpus across 2 nodes, example: -- Run on the first (master) node with example IP 123.456.123.456: -$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py -- Run on the worker node: -$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py -(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1) -""" - -import math -import os -import time -from contextlib import nullcontext -from datetime import datetime -from functools import partial - -import torch -from model import Transformer, ModelArgs -from torch.distributed import destroy_process_group, init_process_group -from torch.nn.parallel import DistributedDataParallel as DDP - -from tinystories import Task - -# ----------------------------------------------------------------------------- -# I/O -out_dir = "out" -eval_interval = 2000 -log_interval = 1 -eval_iters = 100 -eval_only = False # if True, script exits right after the first eval -always_save_checkpoint = False # if True, always save a checkpoint after each eval -init_from = "scratch" # 'scratch' or 'resume' -# wandb logging -wandb_log = False # disabled by default -wandb_project = "llamac" -wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") -# data -batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size -max_seq_len = 256 -vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained -vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens -# model -dim = 288 -n_layers = 6 -n_heads = 6 -n_kv_heads = 6 -multiple_of = 32 -dropout = 0.0 -# adamw optimizer -gradient_accumulation_steps = 4 # used to simulate larger batch sizes -learning_rate = 5e-4 # max learning rate -max_iters = 100000 # total number of training iterations -weight_decay = 1e-1 -beta1 = 0.9 -beta2 = 0.95 -grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 -# learning rate decay settings -decay_lr = True # whether to decay the learning rate -warmup_iters = 1000 # how many steps to warm up for -# system -device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks -dtype = "bfloat16" # float32|bfloat16|float16 -compile = True # use PyTorch 2.0 to compile the model to be faster -# ----------------------------------------------------------------------------- -config_keys = [ - k - for k, v in globals().items() - if not k.startswith("_") and isinstance(v, (int, float, bool, str)) -] -exec(open("configurator.py").read()) # overrides from command line or config file -config = {k: globals()[k] for k in config_keys} # will be useful for logging -# ----------------------------------------------------------------------------- - -# fixing some hyperparams to sensible defaults -lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla -min_lr = 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla - -# validating checks -assert vocab_source in ["llama2", "custom"] -assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens" - -# various inits, derived attributes, I/O setup -ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run? -if ddp: - init_process_group(backend="nccl") - ddp_rank = int(os.environ["RANK"]) - ddp_local_rank = int(os.environ["LOCAL_RANK"]) - ddp_world_size = int(os.environ["WORLD_SIZE"]) - device = f"cuda:{ddp_local_rank}" - torch.cuda.set_device(device) - master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. - seed_offset = ddp_rank # each process gets a different seed - # world_size number of processes will be training simultaneously, so we can scale - # down the desired gradient accumulation iterations per process proportionally - assert gradient_accumulation_steps % ddp_world_size == 0 - gradient_accumulation_steps //= ddp_world_size -else: - # if not ddp, we are running on a single gpu, and one process - master_process = True - seed_offset = 0 - ddp_world_size = 1 -tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len -if master_process: - print(f"tokens per iteration will be: {tokens_per_iter:,}") - print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len") - -if master_process: - os.makedirs(out_dir, exist_ok=True) -torch.manual_seed(1337 + seed_offset) -torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul -torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast -# note: float16 data type will automatically use a GradScaler -ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype] -ctx = ( - nullcontext() - if device_type == "cpu" - else torch.amp.autocast(device_type=device_type, dtype=ptdtype) -) - -# task-specific setup -iter_batches = partial( - Task.iter_batches, - batch_size=batch_size, - max_seq_len=max_seq_len, - vocab_size=vocab_size, - vocab_source=vocab_source, - device=device, - num_workers=0, -) - -# init these up here, can override if init_from='resume' (i.e. from a checkpoint) -iter_num = 0 -best_val_loss = 1e9 - -# model init -model_args = dict( - dim=dim, - n_layers=n_layers, - n_heads=n_heads, - n_kv_heads=n_kv_heads, - vocab_size=vocab_size, - multiple_of=multiple_of, - max_seq_len=max_seq_len, - dropout=dropout, -) # start with model_args from command line -if init_from == "scratch": - # init a new model from scratch - print("Initializing a new model from scratch") - gptconf = ModelArgs(**model_args) - model = Transformer(gptconf) -elif init_from == "resume": - print(f"Resuming training from {out_dir}") - # resume training from a checkpoint. - ckpt_path = os.path.join(out_dir, "ckpt.pt") - checkpoint = torch.load(ckpt_path, map_location=device) - checkpoint_model_args = checkpoint["model_args"] - # force these config attributes to be equal otherwise we can't even resume training - # the rest of the attributes (e.g. dropout) can stay as desired from command line - for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]: - model_args[k] = checkpoint_model_args[k] - # create the model - gptconf = ModelArgs(**model_args) - model = Transformer(gptconf) - state_dict = checkpoint["model"] - # fix the keys of the state dictionary :( - # honestly no idea how checkpoints sometimes get this prefix, have to debug more - unwanted_prefix = "_orig_mod." - for k, v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k) - model.load_state_dict(state_dict) - iter_num = checkpoint["iter_num"] - best_val_loss = checkpoint["best_val_loss"] -model.to(device) - -# initialize a GradScaler. If enabled=False scaler is a no-op -scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16")) - -# optimizer -optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) -if init_from == "resume" and "optimizer" in checkpoint: - optimizer.load_state_dict(checkpoint["optimizer"]) -checkpoint = None # free up memory - -# compile the model -if compile: - print("compiling the model... (takes a ~minute)") - unoptimized_model = model - model = torch.compile(model) # requires PyTorch 2.0 - -# wrap model into DDP container -if ddp: - # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at - # construction time since NCCL does not support `ComplexFloat` - prefix = "_orig_mod." if compile else "" - model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"} - model = DDP(model, device_ids=[ddp_local_rank]) - -# helps estimate an arbitrarily accurate loss over either split using many batches -@torch.no_grad() -def estimate_loss(): - out = {} - model.eval() - for split in ["train", "val"]: - batch_iter = iter_batches(split=split) - losses = torch.zeros(eval_iters) # keep on CPU - for k in range(eval_iters): - X, Y = next(batch_iter) - with ctx: - logits = model(X, Y) - loss = raw_model.last_loss - losses[k] = loss.item() - out[split] = losses.mean() - model.train() - return out - -# learning rate decay scheduler (cosine with warmup) -def get_lr(it): - # 1) linear warmup for warmup_iters steps - if it < warmup_iters: - return learning_rate * it / warmup_iters - # 2) if it > lr_decay_iters, return min learning rate - if it > lr_decay_iters: - return min_lr - # 3) in between, use cosine decay down to min learning rate - decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) - assert 0 <= decay_ratio <= 1 - coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 - return min_lr + coeff * (learning_rate - min_lr) - -# logging -if wandb_log and master_process: - import wandb - wandb.init(project=wandb_project, name=wandb_run_name, config=config) - -# training loop -train_batch_iter = iter_batches(split="train") -X, Y = next(train_batch_iter) # fetch the very first batch -t0 = time.time() -local_iter_num = 0 # number of iterations in the lifetime of this process -raw_model = model.module if ddp else model # unwrap DDP container if needed -running_mfu = -1.0 -while True: - # determine and set the learning rate for this iteration - lr = get_lr(iter_num) if decay_lr else learning_rate - for param_group in optimizer.param_groups: - param_group["lr"] = lr - - # evaluate the loss on train/val sets and write checkpoints - if iter_num % eval_interval == 0 and master_process: - losses = estimate_loss() - print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") - if wandb_log: - try: - wandb.log( - { - "iter": iter_num, - "tokens": iter_num * tokens_per_iter, - "loss/train": losses["train"], - "loss/val": losses["val"], - "lr": lr, - "mfu": running_mfu * 100, # convert to percentage - } - ) - except Exception as e: - print(f"logging to wandb failed: {e}") - if losses["val"] < best_val_loss or always_save_checkpoint: - best_val_loss = losses["val"] - if iter_num > 0: - checkpoint = { - "model": raw_model.state_dict(), - "optimizer": optimizer.state_dict(), - "model_args": model_args, - "iter_num": iter_num, - "best_val_loss": best_val_loss, - "config": config, - } - print(f"saving checkpoint to {out_dir}") - torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt")) - raw_model.export(os.path.join(out_dir, "model.bin")) - if iter_num == 0 and eval_only: - break - - # forward backward update, with optional gradient accumulation to simulate larger batch size - # and using the GradScaler if data type is float16 - for micro_step in range(gradient_accumulation_steps): - if ddp: - # in DDP training we only need to sync gradients at the last micro step. - # the official way to do this is with model.no_sync() context manager, but - # I really dislike that this bloats the code and forces us to repeat code - # looking at the source of that context manager, it just toggles this variable - model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1 - with ctx: - logits = model(X, Y) - loss = raw_model.last_loss - loss = loss / gradient_accumulation_steps - # immediately async prefetch next batch while model is doing the forward pass on the GPU - X, Y = next(train_batch_iter) - # backward pass, with gradient scaling if training in fp16 - scaler.scale(loss).backward() - # clip the gradient - if grad_clip != 0.0: - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) - # step the optimizer and scaler if training in fp16 - scaler.step(optimizer) - scaler.update() - # flush the gradients as soon as we can, no need for this memory anymore - optimizer.zero_grad(set_to_none=True) - - # timing and logging - t1 = time.time() - dt = t1 - t0 - t0 = t1 - if iter_num % log_interval == 0 and master_process: - # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point - lossf = loss.item() * gradient_accumulation_steps - if local_iter_num >= 5: # let the training loop settle a bit - mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt) - running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu - print( - f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%" - ) - iter_num += 1 - local_iter_num += 1 - - # termination conditions - if iter_num > max_iters: - break - -if ddp: - destroy_process_group() diff --git a/train_vocab.sh b/train_vocab.sh deleted file mode 100755 index 7803af8..0000000 --- a/train_vocab.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/bin/bash - -# Trains a sentencepiece tokenizer model on a bunch of given data, my best -# effort attempt to replicate how Meta trained their Llama 2 tokenizer. - -# usage: $ train_vocab.sh -# example: -# ./train_vocab.sh tiny.txt tokenizer_tiny 1024 -# requirements: -# install https://github.com/google/sentencepiece - -# check if the correct number of arguments are provided -if [ $# -ne 3 ]; then - echo "Usage: $0 " - exit 1 -fi - -# assign command-line arguments to variables -input=$1 -model_prefix=$2 -vocab_size=$3 - -# check if input file exists -if [ ! -f "$input" ]; then - echo "Usage: $0 " - echo "input '$input' not found." - exit 1 -fi - -# check if vocab_size is a positive integer -if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then - echo "Usage: $0 " - echo "vocab_size size must be a positive integer." - exit 1 -fi - -# Print the processed inputs -echo "Input: $input" -echo "Model Prefix: $model_prefix" -echo "Vocabulary Size: $vocab_size" - -# train a sentencepiece tokenizer model -# Llama 2 config can be printed as follows: - -# import sentencepiece.sentencepiece_model_pb2 -# mp = sentencepiece.sentencepiece_model_pb2.ModelProto() -# mp.ParseFromString(open("tokenizer.model", "rb").read()) -# print(mp.trainer_spec) -# print(mp.normalizer_spec) - -# this gives: - -# trainer_spec { -# input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged" -# model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2" -# model_type: BPE -# vocab_size: 32000 -# self_test_sample_size: 0 -# input_format: "text" -# character_coverage: 0.9999499917030334 -# input_sentence_size: 200000000 -# seed_sentencepiece_size: 1000000 -# shrinking_factor: 0.75 -# num_threads: 80 -# num_sub_iterations: 2 -# max_sentence_length: 4192 -# shuffle_input_sentence: true -# max_sentencepiece_length: 16 -# split_by_unicode_script: true -# split_by_whitespace: true -# split_by_number: true -# treat_whitespace_as_suffix: false -# split_digits: true -# allow_whitespace_only_pieces: true -# vocabulary_output_piece_score: true -# hard_vocab_limit: true -# use_all_vocab: false -# byte_fallback: true -# required_chars: "" -# unk_id: 0 -# bos_id: 1 -# eos_id: 2 -# pad_id: -1 -# unk_surface: " \342\201\207 " -# unk_piece: "" -# bos_piece: "" -# eos_piece: "" -# pad_piece: "" -# train_extremely_large_corpus: false -# enable_differential_privacy: false -# differential_privacy_noise_level: 0.0 -# differential_privacy_clipping_threshold: 0 -# } -# normalizer_spec { -# name: "identity" -# precompiled_charsmap: "" -# add_dummy_prefix: true -# remove_extra_whitespaces: false -# normalization_rule_tsv: "" -# } - -# let's now use spm_train to train this exact model -# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md - -# we'll depart on a few settings: -# character_coverage -> 1.0 - -# other important notes: -# --split-digits = true, per the paper -# --allow_whitespace_only_pieces is true, default in spm is false -# --byte_fallback is true, default in spm is false -# --normalization_rule_name is identity, default in spm is nmt_nfkc - -spm_train --input="$input" \ - --model_prefix="$model_prefix" \ - --model_type=bpe \ - --vocab_size="$vocab_size" \ - --self_test_sample_size=0 \ - --input_format="text" \ - --character_coverage=1.0 \ - --num_threads="$(nproc)" \ - --split_digits=true \ - --allow_whitespace_only_pieces=true \ - --byte_fallback=true \ - --unk_surface=" \342\201\207 " \ - --normalization_rule_name=identity \