From 01df3731d6747659ad4d8cf7d9f4bcb27eb6d5f0 Mon Sep 17 00:00:00 2001
From: YiMing Han <hanyiming1995@gmail.com>
Date: Fri, 18 Aug 2023 15:09:24 -0400
Subject: [PATCH] only dart

---
 .github/workflows/build.yml | 193 ------------------
 configurator.py             |  47 -----
 export_meta_llama_bin.py    | 112 -----------
 export_meta_llama_hf_bin.py | 113 -----------
 model.py                    | 392 ------------------------------------
 requirements.txt            |   7 -
 run.ipynb                   | 130 ------------
 sample.py                   |  79 --------
 save_torchscript.py         |  66 ------
 tinystories.py              | 274 -------------------------
 tokenizer.py                |  78 -------
 train.py                    | 342 -------------------------------
 train_vocab.sh              | 126 ------------
 13 files changed, 1959 deletions(-)
 delete mode 100644 .github/workflows/build.yml
 delete mode 100644 configurator.py
 delete mode 100644 export_meta_llama_bin.py
 delete mode 100644 export_meta_llama_hf_bin.py
 delete mode 100644 model.py
 delete mode 100644 requirements.txt
 delete mode 100644 run.ipynb
 delete mode 100644 sample.py
 delete mode 100755 save_torchscript.py
 delete mode 100644 tinystories.py
 delete mode 100644 tokenizer.py
 delete mode 100644 train.py
 delete mode 100755 train_vocab.sh

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
deleted file mode 100644
index 7e6474d..0000000
--- a/.github/workflows/build.yml
+++ /dev/null
@@ -1,193 +0,0 @@
-name: Continuous Integration
-
-on:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/**', '**/Makefile', '**/*.c', '**/*.h', '**/*.py']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/Makefile', '**/*.c', '**/*.h', '**/*.py']
-  # for manual triggering
-  workflow_dispatch:
-
-env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-jobs:
-  # check basic builds to avoid breaking changes
-  ubuntu-focal-make:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential -y
-
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        run: |
-          python -m pip install --upgrade pip
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-
-      - name: Build
-        id: make_build
-        run: |
-          make
-
-      - name: Build runfast
-        id: make_build_runfast
-        run: |
-          make runfast
-
-      - name: Test with pytest
-        run: |
-          pytest
-
-  macOS-latest-make:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        run: |
-          python -m pip install --upgrade pip
-          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-
-      - name: Build clang
-        id: make_build_clang
-        run: |
-          make run CC=clang
-
-      - name: Build
-        id: make_build
-        run: |
-          make
-
-      - name: Build runfast
-        id: make_build_runfast
-        run: |
-          make runfast
-
-      - name: Test with pytest
-        run: pytest
-
-
-
-
-  windows-latest-make:
-    runs-on: windows-latest
-
-    strategy:
-      fail-fast: false  #necessary, otherwise the matrix breaks
-      matrix:
-        arch:
-          - amd64
-          - amd64_x86
-          - amd64_arm64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Setup MSBuild
-        uses: microsoft/setup-msbuild@v1
-
-      - name: Setup MSVC ${{ matrix.arch }}
-        uses: ilammy/msvc-dev-cmd@v1
-        with:
-          arch: ${{ matrix.arch }}
-
-      - name: Set up Python 3.10
-        if: matrix.arch != 'amd64_arm64'
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        if: matrix.arch != 'amd64_arm64'
-        run: |
-          python -m pip install --upgrade pip
-          if (Test-Path requirements.txt) {
-            pip install -r requirements.txt
-          }
-
-      - name: Build ${{ matrix.arch }}
-        id: build_msvc
-        run: |
-          .\build_msvc.bat
-
-      #cross-comiled, cannot be run on host
-      - name: Test with pytest
-        if: matrix.arch != 'amd64_arm64'
-        run: pytest
-
-  windows-latest-mingw:
-    runs-on: windows-latest
-
-    defaults:
-      run:
-        shell: msys2 {0}
-
-    strategy:
-      matrix:
-        include:
-          - { sys: mingw64, env: x86_64 }
-
-    steps:
-      - name: Checkout
-        id: checkout
-        uses: actions/checkout@v3
-
-      - uses: msys2/setup-msys2@v2
-        id: setup-msys2
-        with:
-          msystem: ${{ matrix.sys }}
-          install: mingw-w64-${{matrix.env}}-gcc make
-
-      - name: Build ${{ matrix.sys }} ${{ matrix.env }}
-        id: build_mingw
-        run: |
-          make win64
-
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v3
-        with:
-          python-version: "3.10"
-
-      - name: Pip setup
-        shell: powershell
-        run: |
-          python -m pip install --upgrade pip
-          if (Test-Path requirements.txt) {
-            pip install -r requirements.txt
-          }
-
-      - name: Test with pytest
-        shell: powershell
-        run: pytest
diff --git a/configurator.py b/configurator.py
deleted file mode 100644
index a8bba95..0000000
--- a/configurator.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-Poor Man's Configurator. Probably a terrible idea. Example usage:
-$ python train.py config/override_file.py --batch_size=32
-this will first run config/override_file.py, then override batch_size to 32
-
-The code in this file will be run as follows from e.g. train.py:
->>> exec(open('configurator.py').read())
-
-So it's not a Python module, it's just shuttling this code away from train.py
-The code in this script then overrides the globals()
-
-I know people are not going to love this, I just really dislike configuration
-complexity and having to prepend config. to every single variable. If someone
-comes up with a better simple Python solution I am all ears.
-"""
-
-import sys
-from ast import literal_eval
-
-for arg in sys.argv[1:]:
-    if '=' not in arg:
-        # assume it's the name of a config file
-        assert not arg.startswith('--')
-        config_file = arg
-        print(f"Overriding config with {config_file}:")
-        with open(config_file) as f:
-            print(f.read())
-        exec(open(config_file).read())
-    else:
-        # assume it's a --key=value argument
-        assert arg.startswith('--')
-        key, val = arg.split('=')
-        key = key[2:]
-        if key in globals():
-            try:
-                # attempt to eval it it (e.g. if bool, number, or etc)
-                attempt = literal_eval(val)
-            except (SyntaxError, ValueError):
-                # if that goes wrong, just use the string
-                attempt = val
-            # ensure the types match ok
-            assert type(attempt) == type(globals()[key])
-            # cross fingers
-            print(f"Overriding: {key} = {attempt}")
-            globals()[key] = attempt
-        else:
-            raise ValueError(f"Unknown config key: {key}")
diff --git a/export_meta_llama_bin.py b/export_meta_llama_bin.py
deleted file mode 100644
index 4e42197..0000000
--- a/export_meta_llama_bin.py
+++ /dev/null
@@ -1,112 +0,0 @@
-"""
-This script exports the Llama 2 weights in llama2c.bin format.
-"""
-import os
-import sys
-import struct
-from pathlib import Path
-import json
-
-import torch
-
-from model import precompute_freqs_cis
-
-
-def export(p, state_dict, filepath='model.bin'):
-    """export the model weights in fp32 into .bin file to be read from C"""
-    f = open(filepath, 'wb')
-
-    def serialize(key):
-        print(f"writing {key}...")
-        t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
-        f.write(memoryview(t))
-        del state_dict[key]
-
-    # first write out the header
-    hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
-    p['vocab_size'] = 32000
-    p['max_seq_len'] = 2048
-
-    n_kv_heads = p.get('n_kv_heads') or p['n_heads']
-    header = struct.pack(
-        'iiiiiii',
-        p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
-        n_kv_heads, -p['vocab_size'], p['max_seq_len']
-    )
-    # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
-    # in the checkpoint and should be loaded.
-    f.write(header)
-
-    # next write out the embedding weights
-    print("writing tok_embeddings...")
-    serialize('tok_embeddings.weight')
-
-    # now all the layers
-    # attention weights
-    for i in range(p['n_layers']): serialize(f'layers.{i}.attention_norm.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wq.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wk.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wv.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.attention.wo.weight')
-    # ffn weights
-    for i in range(p['n_layers']): serialize(f'layers.{i}.ffn_norm.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w1.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w2.weight')
-    for i in range(p['n_layers']): serialize(f'layers.{i}.feed_forward.w3.weight')
-
-    # final rmsnorm
-    serialize('norm.weight')
-    # freqs_cos, freqs_sin
-    freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
-    state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
-    state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
-    serialize('freqs_cos')
-    serialize('freqs_sin')
-
-    # finally write the output weights
-    serialize('output.weight')
-
-    f.close()
-    print(f"wrote {filepath}")
-
-
-def concat_weights(models):
-    state_dict = {}
-    for name in list(models[0]):
-        tensors = [model[name] for model in models]
-        if len(tensors) == 1 or len(tensors[0].shape) == 1:
-            state_dict[name] = tensors[0]
-            continue
-        is_axis_1 = (
-            name.startswith('tok_embeddings.')
-            or name.endswith('.attention.wo.weight')
-            or name.endswith('.feed_forward.w2.weight')
-        )
-        axis = 1 if is_axis_1 else 0
-        state_dict[name] = torch.cat(tensors, dim=axis)
-        for model in models:
-            del model[name]
-    return state_dict
-
-
-def load_and_export(model_path, output_path):
-    params_path = os.path.join(model_path, 'params.json')
-    with open(params_path) as f:
-        params = json.load(f)
-        print(params)
-
-    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
-    models = [torch.load(p, map_location='cpu') for p in model_paths]
-    state_dict = concat_weights(models)
-    del models
-    export(params, state_dict, output_path)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) == 1:
-        print('[Llama model folder path] [output path]')
-        exit()
-
-    model_path = sys.argv[1]
-    output_path = sys.argv[2]
-    load_and_export(model_path, output_path)
diff --git a/export_meta_llama_hf_bin.py b/export_meta_llama_hf_bin.py
deleted file mode 100644
index e3a8c73..0000000
--- a/export_meta_llama_hf_bin.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""
-This script exports the Llama 2 weights in llama2c.bin format.
-"""
-import os
-import sys
-import struct
-from pathlib import Path
-import json
-
-import torch
-
-from model import precompute_freqs_cis
-
-
-def export(p, state_dict, filepath='model.bin'):
-    """export the model weights in fp32 into .bin file to be read from C"""
-    f = open(filepath, 'wb')
-
-    def serialize(key):
-        print(f"writing {key}...")
-        t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
-        f.write(memoryview(t))
-        del state_dict[key]
-
-    # first write out the header
-    hidden_dim = state_dict['model.layers.0.mlp.gate_proj.weight'].shape[0]
-    p['vocab_size'] = 32000
-    p['max_seq_len'] = 2048
-
-    n_kv_heads = p.get('n_kv_heads') or p['n_heads']
-    header = struct.pack(
-        'iiiiiii',
-        p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
-        n_kv_heads, -p['vocab_size'], p['max_seq_len']
-    )
-    # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
-    # in the checkpoint and should be loaded.
-    f.write(header)
-
-    # next write out the embedding weights
-    print("writing tok_embeddings...")
-    serialize('model.embed_tokens.weight')
-
-    # now all the layers
-    # attention weights
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.input_layernorm.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.q_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.k_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.v_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.o_proj.weight')
-    # ffn weights
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.post_attention_layernorm.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.gate_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.down_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.up_proj.weight')
-
-    # final rmsnorm
-    serialize('model.norm.weight')
-    # freqs_cos, freqs_sin
-    freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
-    state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
-    state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
-    # check if this requires addtional conversion
-    serialize('freqs_cos')
-    serialize('freqs_sin')
-
-    # finally write the output weights
-    serialize('lm_head.weight')
-
-    f.close()
-    print(f"wrote {filepath}")
-
-
-def concat_weights(models):
-    state_dict = {}
-    for name in list(models[0]):
-        tensors = [model[name] for model in models]
-        if len(tensors) == 1 or len(tensors[0].shape) == 1:
-            state_dict[name] = tensors[0]
-            continue
-        is_axis_1 = (
-            name.startswith('model.embed_tokens.weight')
-            or name.endswith('.self_attn.o_proj.weight')
-            or name.endswith('.mlp.down_proj.weight')
-        )
-        axis = 1 if is_axis_1 else 0
-        state_dict[name] = torch.cat(tensors, dim=axis)
-        for model in models:
-            del model[name]
-    return state_dict
-
-
-def load_and_export(model_path, output_path):
-    params_path = os.path.join(model_path, 'params.json')
-    with open(params_path) as f:
-        params = json.load(f)
-        print(params)
-
-    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
-    models = [torch.load(p, map_location='cpu') for p in model_paths]
-    state_dict = concat_weights(models)
-    del models
-    export(params, state_dict, output_path)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) == 1:
-        print('[Llama model folder path] [output path]')
-        exit()
-
-    model_path = sys.argv[1]
-    output_path = sys.argv[2]
-    load_and_export(model_path, output_path)
diff --git a/model.py b/model.py
deleted file mode 100644
index c8c82a9..0000000
--- a/model.py
+++ /dev/null
@@ -1,392 +0,0 @@
-import math
-import struct
-import inspect
-from dataclasses import dataclass
-from typing import Any, Optional, Tuple
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-@dataclass
-class ModelArgs:
-    # default hyperparameters for the Llama 7B model
-    dim: int = 4096
-    n_layers: int = 32
-    n_heads: int = 32
-    n_kv_heads: Optional[int] = None
-    vocab_size: int = 32000
-    multiple_of: int = 256  # MLP hidden layer size will be multiple of
-    norm_eps: float = 1e-5
-    max_seq_len: int = 2048
-    dropout: float = 0.0
-
-
-class RMSNorm(torch.nn.Module):
-    def __init__(self, dim: int, eps: float):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
-    def forward(self, x):
-        output = self._norm(x.float()).type_as(x)
-        return output * self.weight
-
-
-def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
-    t = torch.arange(end, device=freqs.device)  # type: ignore
-    freqs = torch.outer(t, freqs).float()  # type: ignore
-    freqs_cos = torch.cos(freqs)  # real part
-    freqs_sin = torch.sin(freqs)  # imaginary part
-    return freqs_cos, freqs_sin
-
-def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
-    ndim = x.ndim
-    assert 0 <= 1 < ndim
-    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
-    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-    return freqs_cis.view(shape)
-
-def apply_rotary_emb(
-    xq: torch.Tensor,
-    xk: torch.Tensor,
-    freqs_cos: torch.Tensor,
-    freqs_sin: torch.Tensor
-) -> Tuple[torch.Tensor, torch.Tensor]:
-
-    # reshape xq and xk to match the complex representation
-    xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)
-    xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)
-
-    # reshape freqs_cos and freqs_sin for broadcasting
-    freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
-    freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
-
-    # apply rotation using real numbers
-    xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
-    xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
-    xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
-    xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos
-
-    # flatten last two dimensions
-    xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
-    xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)
-
-    return xq_out.type_as(xq), xk_out.type_as(xk)
-
-def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
-    bs, slen, n_kv_heads, head_dim = x.shape
-    if n_rep == 1:
-        return x
-    return (
-        x[:, :, :, None, :]
-        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
-        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
-    )
-
-class Attention(nn.Module):
-    def __init__(self, args: ModelArgs):
-        super().__init__()
-        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
-        assert args.n_heads % self.n_kv_heads == 0
-        model_parallel_size = 1
-        self.n_local_heads = args.n_heads // model_parallel_size
-        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
-        self.n_rep = self.n_local_heads // self.n_local_kv_heads
-        self.head_dim = args.dim // args.n_heads
-        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
-        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
-        self.attn_dropout = nn.Dropout(args.dropout)
-        self.resid_dropout = nn.Dropout(args.dropout)
-        self.dropout = args.dropout
-
-        # use flash attention or a manual implementation?
-        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
-        if not self.flash:
-            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
-            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
-            mask = torch.triu(mask, diagonal=1)
-            self.register_buffer("mask", mask)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        freqs_cos: torch.Tensor,
-        freqs_sin: torch.Tensor,
-    ):
-        bsz, seqlen, _ = x.shape
-
-        # QKV
-        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
-        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
-        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-
-        # RoPE relative positional embeddings
-        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
-
-        # grouped multiquery attention: expand out keys and values
-        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
-        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
-
-        # make heads into a batch dimension
-        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
-        xk = xk.transpose(1, 2)
-        xv = xv.transpose(1, 2)
-
-        # flash implementation
-        if self.flash:
-            output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
-        else:
-            # manual implementation
-            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
-            assert hasattr(self, 'mask')
-            scores = scores + self.mask[:, :, :seqlen, :seqlen]   # (bs, n_local_heads, seqlen, cache_len + seqlen)
-            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
-            scores = self.attn_dropout(scores)
-            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
-
-        # restore time as batch dimension and concat heads
-        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
-
-        # final projection into the residual stream
-        output = self.wo(output)
-        output = self.resid_dropout(output)
-        return output
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
-        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
-        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
-
-
-class TransformerBlock(nn.Module):
-    def __init__(self, layer_id: int, args: ModelArgs):
-        super().__init__()
-        self.n_heads = args.n_heads
-        self.dim = args.dim
-        self.head_dim = args.dim // args.n_heads
-        self.attention = Attention(args)
-        self.feed_forward = FeedForward(
-            dim=args.dim,
-            hidden_dim=4 * args.dim,
-            multiple_of=args.multiple_of,
-            dropout=args.dropout,
-        )
-        self.layer_id = layer_id
-        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
-        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
-
-    def forward(self, x, freqs_cos, freqs_sin):
-        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
-        out = h + self.feed_forward.forward(self.ffn_norm(h))
-        return out
-
-
-class Transformer(nn.Module):
-    last_loss: Optional[torch.Tensor]
-
-    def __init__(self, params: ModelArgs):
-        super().__init__()
-        self.params = params
-        self.vocab_size = params.vocab_size
-        self.n_layers = params.n_layers
-
-        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
-        self.dropout = nn.Dropout(params.dropout)
-        self.layers = torch.nn.ModuleList()
-        for layer_id in range(params.n_layers):
-            self.layers.append(TransformerBlock(layer_id, params))
-        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
-        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
-
-        # share the unembedding parameters with the embedding parameters
-        self.tok_embeddings.weight = self.output.weight # https://paperswithcode.com/method/weight-tying
-
-        # some useful precompute for the RoPE relative positional embeddings
-        freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
-        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
-        self.register_buffer("freqs_sin", freqs_sin, persistent=False)
-
-        # init all weights
-        self.apply(self._init_weights)
-        # apply special scaled init to the residual projections, per GPT-2 paper
-        for pn, p in self.named_parameters():
-            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
-                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))
-
-        # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor.
-        self.last_loss = None
-
-    def _init_weights(self, module):
-        if isinstance(module, nn.Linear):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
-        _bsz, seqlen = tokens.shape
-        h = self.tok_embeddings(tokens)
-        h = self.dropout(h)
-        freqs_cos = self.freqs_cos[:seqlen]
-        freqs_sin = self.freqs_sin[:seqlen]
-
-        for layer in self.layers:
-            h = layer(h, freqs_cos, freqs_sin)
-        h = self.norm(h)
-
-        if targets is not None:
-            # if we are given some desired targets also calculate the loss
-            logits = self.output(h)
-            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
-        else:
-            # inference-time mini-optimization: only forward the output on the very last position
-            logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim
-            self.last_loss = None
-
-        return logits
-
-    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
-        # start with all of the candidate parameters
-        param_dict = {pn: p for pn, p in self.named_parameters()}
-        # filter out those that do not require grad
-        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
-        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
-        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
-        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
-        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
-        optim_groups = [
-            {'params': decay_params, 'weight_decay': weight_decay},
-            {'params': nodecay_params, 'weight_decay': 0.0}
-        ]
-        num_decay_params = sum(p.numel() for p in decay_params)
-        num_nodecay_params = sum(p.numel() for p in nodecay_params)
-        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
-        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
-        # Create AdamW optimizer and use the fused version if it is available
-        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
-        use_fused = fused_available and device_type == 'cuda'
-        extra_args = dict(fused=True) if use_fused else dict()
-        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
-        print(f"using fused AdamW: {use_fused}")
-
-        return optimizer
-
-    def estimate_mfu(self, fwdbwd_per_iter, dt):
-        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
-        # first estimate the number of flops we do per iteration.
-        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
-        N = sum(p.numel() for p in self.parameters())
-        cfg = self.params
-        L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim//cfg.n_heads, cfg.max_seq_len
-        flops_per_token = 6*N + 12*L*H*Q*T
-        flops_per_fwdbwd = flops_per_token * T
-        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
-        # express our flops throughput as ratio of A100 bfloat16 peak flops
-        flops_achieved = flops_per_iter * (1.0/dt) # per second
-        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
-        mfu = flops_achieved / flops_promised
-        return mfu
-
-    @torch.inference_mode()
-    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
-        """
-        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
-        the sequence max_new_tokens times, feeding the predictions back into the model each time.
-        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
-        Also note this is a super inefficient version of sampling with no key/value cache.
-        """
-        for _ in range(max_new_tokens):
-            # if the sequence context is growing too long we must crop it at block_size
-            idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
-            # forward the model to get the logits for the index in the sequence
-            logits = self(idx_cond)
-            logits = logits[:, -1, :] # crop to just the final time step
-            if temperature == 0.0:
-                # "sample" the single most likely index
-                _, idx_next = torch.topk(logits, k=1, dim=-1)
-            else:
-                # pluck the logits at the final step and scale by desired temperature
-                logits = logits / temperature
-                # optionally crop the logits to only the top k options
-                if top_k is not None:
-                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                    logits[logits < v[:, [-1]]] = -float('Inf')
-                # apply softmax to convert logits to (normalized) probabilities
-                probs = F.softmax(logits, dim=-1)
-                idx_next = torch.multinomial(probs, num_samples=1)
-            # append sampled index to the running sequence and continue
-            idx = torch.cat((idx, idx_next), dim=1)
-
-        return idx
-
-    def export(self, filepath='model.bin'):
-        """export the model weights in fp32 into .bin file to be read from C"""
-        f = open(filepath, 'wb')
-
-        def serialize(t):
-            d = t.detach().cpu().view(-1).numpy().astype(np.float32)
-            b = struct.pack(f'{len(d)}f', *d)
-            f.write(b)
-
-        # first write out the header
-        hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
-        p = self.params
-        n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
-        header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
-                                       n_kv_heads, p.vocab_size, p.max_seq_len)
-        f.write(header)
-
-        # next write out the embedding weights
-        serialize(self.tok_embeddings.weight)
-
-        # now all the layers
-        # attention weights
-        for layer in self.layers:
-            serialize(layer.attention_norm.weight)
-        for layer in self.layers:
-            serialize(layer.attention.wq.weight)
-        for layer in self.layers:
-            serialize(layer.attention.wk.weight)
-        for layer in self.layers:
-            serialize(layer.attention.wv.weight)
-        for layer in self.layers:
-            serialize(layer.attention.wo.weight)
-        # ffn weights
-        for layer in self.layers:
-            serialize(layer.ffn_norm.weight)
-        for layer in self.layers:
-            serialize(layer.feed_forward.w1.weight)
-        for layer in self.layers:
-            serialize(layer.feed_forward.w2.weight)
-        for layer in self.layers:
-            serialize(layer.feed_forward.w3.weight)
-        # final rmsnorm
-        serialize(self.norm.weight)
-        # note: no need to write final classifier weights due to weight sharing
-        # freqs_cis
-        serialize(self.freqs_cos[:p.max_seq_len])
-        serialize(self.freqs_sin[:p.max_seq_len])
-
-        # write to binary file
-        f.close()
-        print(f"wrote {filepath}")
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 7187a73..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-numpy==1.23.5
-pytest==7.4.0
-Requests==2.31.0
-sentencepiece==0.1.99
-torch==2.0.1
-tqdm==4.64.1
-wandb==0.15.5
diff --git a/run.ipynb b/run.ipynb
deleted file mode 100644
index ac57593..0000000
--- a/run.ipynb
+++ /dev/null
@@ -1,130 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "HLdoj4cz-xal"
-      },
-      "source": [
-        "# Run.c\n",
-        "\n",
-        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/karpathy/llama2.c/blob/master/run.ipynb)\n",
-        "\n",
-        "More details can be found in the [README.md](README.md) ."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "Une3Ozlnu1B7"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Clone Project\n",
-        "\n",
-        "!git clone https://github.com/karpathy/llama2.c.git\n",
-        "%cd llama2.c"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "#@title Build\n",
-        "\n",
-        "!make runfast"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "thm0ZBrtSgoC"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Pick Your Model\n",
-        "\n",
-        "#@markdown Choose model\n",
-        "model = \"stories15M\" #@param [\"stories15M\", \"stories42M\", \"stories110M\"]\n",
-        "\n",
-        "download_url = \"\"\n",
-        "\n",
-        "if(model == \"stories15M\"):\n",
-        "  download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin\"\n",
-        "if(model == \"stories42M\"):\n",
-        "  download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin\"\n",
-        "if(model == \"stories110M\"):\n",
-        "  download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin\"\n",
-        "\n",
-        "print(f\"download_url: {download_url}\")\n",
-        "\n",
-        "!wget $download_url\n",
-        "\n",
-        "model_file = model + \".bin\""
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "OgAc3KjuT-NM"
-      },
-      "outputs": [],
-      "source": [
-        "#@title Generate Stories\n",
-        "\n",
-        "# Generate args\n",
-        "max_token = 256 #@param {type:\"slider\", min:32, max:1024, step:32}\n",
-        "temperature = 0.8 #@param {type:\"slider\", min:0.0, max:1, step:0.05}\n",
-        "top_p = 0.9 #@param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
-        "prompt = \"One day, Lily met a Shoggoth\" #@param {type:\"string\"}\n",
-        "\n",
-        "print(f\"model: {model_file}, max_token: {max_token}, temperature: {temperature}, top_p: {top_p}, prompt: {prompt}\")\n",
-        "print(f\"----------------------------\\n\")\n",
-        "\n",
-        "cmd = f'./run {model_file} -t {temperature} -p {top_p} -n {max_token} -i \"{prompt}\"'\n",
-        "!{cmd}"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "#@title Run Meta's Llama 2 models\n",
-        "\n",
-        "#@markdown input your huggingface [access token](https://huggingface.co/settings/tokens) to download Meta's Llama 2 models.\n",
-        "\n",
-        "from huggingface_hub import snapshot_download\n",
-        "\n",
-        "token = \"replace your huggingface access token\" #@param {type:\"string\"}\n",
-        "path = snapshot_download(repo_id=\"meta-llama/Llama-2-7b\",cache_dir=\"Llama-2-7b\", use_auth_token=token)\n",
-        "\n",
-        "!python export_meta_llama_bin.py $path llama2_7b.bin\n",
-        "\n",
-        "print(\"./run llama2_7b.bin\\n\")\n",
-        "!./run llama2_7b.bin"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "private_outputs": true,
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
diff --git a/sample.py b/sample.py
deleted file mode 100644
index d2f56ea..0000000
--- a/sample.py
+++ /dev/null
@@ -1,79 +0,0 @@
-"""
-Sample from the trained model with PyTorch
-"""
-import os
-import pickle
-from contextlib import nullcontext
-import torch
-from model import ModelArgs, Transformer
-from tokenizer import Tokenizer
-
-from tinystories import get_tokenizer_model_path
-
-# -----------------------------------------------------------------------------
-checkpoint = 'out/ckpt.pt'
-start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
-num_samples = 1 # number of samples to draw
-max_new_tokens = 100 # number of tokens generated in each sample
-temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
-top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability
-tokenizer = "" # override the tokenizer model path
-seed = 1337
-device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
-#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
-dtype = "float32"
-compile = False # use PyTorch 2.0 to compile the model to be faster
-exec(open('configurator.py').read()) # overrides from command line or config file
-# -----------------------------------------------------------------------------
-
-torch.manual_seed(seed)
-torch.cuda.manual_seed(seed)
-torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
-torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
-device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
-ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
-
-# init from a model saved in a specific directory
-checkpoint_dict = torch.load(checkpoint, map_location=device)
-gptconf = ModelArgs(**checkpoint_dict['model_args'])
-model = Transformer(gptconf)
-state_dict = checkpoint_dict['model']
-unwanted_prefix = '_orig_mod.'
-for k,v in list(state_dict.items()):
-    if k.startswith(unwanted_prefix):
-        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-model.load_state_dict(state_dict, strict=False)
-
-model.eval()
-model.to(device)
-if compile:
-    print("Compiling the model...")
-    model = torch.compile(model) # requires PyTorch 2.0 (optional)
-
-# load the tokenizer
-vocab_source = checkpoint_dict.get("vocab_source", "llama2")
-vocab_size = gptconf.vocab_size
-if tokenizer:
-    # a specific tokenizer is provided, use it
-    tokenizer_model = tokenizer
-else:
-    # let's try to find the tokenizer model automatically. bit gross here...
-    query_vocab_size = 0 if vocab_source == "llama2" else vocab_size
-    tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size)
-enc = Tokenizer(tokenizer_model=tokenizer_model)
-
-# encode the beginning of the prompt
-if start.startswith('FILE:'):
-    with open(start[5:], 'r', encoding='utf-8') as f:
-        start = f.read()
-start_ids = enc.encode(start, bos=True, eos=False)
-x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
-
-# run generation
-with torch.no_grad():
-    with ctx:
-        for k in range(num_samples):
-            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
-            print(enc.decode(y[0].tolist()))
-            print('---------------')
diff --git a/save_torchscript.py b/save_torchscript.py
deleted file mode 100755
index af3a299..0000000
--- a/save_torchscript.py
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env python
-"""Saves the model as a TorchScript.
-
-Usage examples:
-    ./save_torchscript.py
-    ./save_torchscript.py --dim=300
-    ./save_torchscript.py --gzip_output=True --zero_params=True
-
-The resulting file can be loaded in C++ code and then used for training or
-inference with:
-    #include <torch/script.h>
-    torch::jit::Module module = torch::jit::load("model.pt")
-
-Note that the serialized model includes the initial parameters and with the default
-ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
-the model parameters separately you can zero out the parameters before saving it and
-it will gzip down to 780K.
-"""
-import gzip
-import os
-import shutil
-from inspect import signature
-
-import torch
-
-from model import ModelArgs, Transformer
-
-# Model args config
-dim = 288
-n_layers = 6
-n_heads = 6
-n_kv_heads = n_heads
-multiple_of = 32
-max_seq_len = 256
-dropout = 0.0
-vocab_size = 32000
-norm_eps = 1e-5
-# Save config
-model_path = "model.pt"
-zero_params = False
-gzip_output = False
-# Allow config overrides
-exec(open("configurator.py").read())
-
-
-def main() -> None:
-    model_args = {k: globals()[k] for k in signature(ModelArgs).parameters}
-    model = Transformer(ModelArgs(**model_args))
-
-    # If requested zero params before saving the model. This is useful in
-    # conjunction with gzip_output.
-    if zero_params:
-        for p in model.parameters():
-            p.detach().zero_()
-
-    torch.jit.save(torch.jit.script(model), model_path)
-
-    if gzip_output:
-        with open(model_path, "rb") as f_in:
-            with gzip.open(f"{model_path}.gz", "wb") as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        os.unlink(model_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tinystories.py b/tinystories.py
deleted file mode 100644
index 690cb02..0000000
--- a/tinystories.py
+++ /dev/null
@@ -1,274 +0,0 @@
-"""
-Download, preprocess and serve the TinyStories dataset as a DataLoader.
-"""
-
-import argparse
-import glob
-import json
-import os
-import random
-from typing import List
-from concurrent.futures import ProcessPoolExecutor
-from functools import partial
-
-import numpy as np
-import requests
-import torch
-import torch.distributed as dist
-from tqdm import tqdm
-
-from tokenizer import Tokenizer
-
-DATA_CACHE_DIR = "data"
-
-def download_file(url: str, fname: str, chunk_size=1024):
-    """Helper function to download a file from a given url"""
-    resp = requests.get(url, stream=True)
-    total = int(resp.headers.get("content-length", 0))
-    with open(fname, "wb") as file, tqdm(
-        desc=fname,
-        total=total,
-        unit="iB",
-        unit_scale=True,
-        unit_divisor=1024,
-    ) as bar:
-        for data in resp.iter_content(chunk_size=chunk_size):
-            size = file.write(data)
-            bar.update(size)
-
-
-def download():
-    """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
-    os.makedirs(DATA_CACHE_DIR, exist_ok=True)
-
-    # download the TinyStories dataset, unless it's already downloaded
-    data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz"
-    data_filename = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz")
-    if not os.path.exists(data_filename):
-        print(f"Downloading {data_url} to {data_filename}...")
-        download_file(data_url, data_filename)
-    else:
-        print(f"{data_filename} already exists, skipping download...")
-
-    # unpack the tar.gz file into all the data shards (json files)
-    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-    if not os.path.exists(data_dir):
-        os.makedirs(data_dir, exist_ok=True)
-        print(f"Unpacking {data_filename}...")
-        os.system(f"tar -xzf {data_filename} -C {data_dir}")
-    else:
-        print(f"{data_dir} already exists, skipping unpacking...")
-
-    # print a single example just for debugging and such
-    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
-    with open(shard_filenames[0], "r") as f:
-        data = json.load(f)
-    print("Download done.")
-    print(f"Number of shards: {len(shard_filenames)}")
-    print(f"Example story:\n{data[0]}")
-
-def train_vocab(vocab_size):
-    """
-    Trains a custom sentencepiece tokenizer on the TinyStories dataset.
-    The custom tokenizer files will be saved in DATA_CACHE_DIR/tok{N} directories,
-    where N is the vocab size. This is also where the pretok .bin files will go.
-    """
-    assert vocab_size > 0, "Vocab size must be positive"
-
-    # output file prefix path for sentencepiece
-    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
-
-    # how many shards we'll use for vocab training, kept low for efficiency
-    num_shards = 10
-
-    # 1) export a large chunk of text as a single text file tiny.txt
-    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
-    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
-
-    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
-    with open(tiny_file, "w") as of:
-        for shard in tqdm(shard_filenames[:num_shards]):
-            with open(shard, "r") as f:
-                data = json.load(f)
-            for example in data:
-                text = example["story"]
-                text = text.strip()
-                of.write(text + "\n")
-    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")
-
-    # 2) run the train_vocab.sh script that trains the sentencepiece model
-    print("Will now train the vocab with:")
-    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
-    print(cmd)
-    print("OK? [y/N] ")
-    dec = input()
-    if dec.lower() != "y":
-        print("Exiting...")
-        return
-    os.system(cmd)
-
-    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
-    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
-    if dec.lower() == "y":
-        os.remove(tiny_file)
-        print(f"Deleted {tiny_file}")
-
-    print(f"Trained tokenizer is in {prefix}.model")
-    print("Done.")
-
-
-def process_shard(args, vocab_size):
-    shard_id, shard = args
-    tokenizer_model = get_tokenizer_model_path(vocab_size)
-    enc = Tokenizer(tokenizer_model)
-    with open(shard, "r") as f:
-        data = json.load(f)
-    all_tokens = []
-    for example in tqdm(data, position=shard_id):
-        text = example["story"]
-        text = text.strip()  # get rid of leading/trailing whitespace
-        tokens = enc.encode(text, bos=True, eos=False)  # encode the text, use BOS
-        all_tokens.extend(tokens)
-    # convert to uint16 nparray
-    all_tokens = np.array(all_tokens, dtype=np.uint16)
-    # calculate the output filename
-    if vocab_size == 0:
-        # if we're using Llama 2, just save the tokenized file in the same dir
-        tokenized_filename = shard.replace(".json", ".bin")
-    else:
-        # save .bin files into a new tok{N} directory
-        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
-        shard_basename = os.path.basename(shard)
-        bin_basename = shard_basename.replace(".json", ".bin")
-        tokenized_filename = os.path.join(bin_dir, bin_basename)
-    # write the bytes
-    with open(tokenized_filename, "wb") as f:
-        f.write(all_tokens.tobytes())
-    # calculate the average sequence length (they are separated by BOS=1)
-    avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
-    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")
-
-
-def pretokenize(vocab_size):
-    # iterate the shards and tokenize all of them one by one
-    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
-    if vocab_size > 0:
-        # .bin files will be saved into tok{N} directory, create it once here
-        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
-        os.makedirs(bin_dir, exist_ok=True)
-
-    # process all the shards in a process pool
-    fun = partial(process_shard, vocab_size=vocab_size)
-    with ProcessPoolExecutor() as executor:
-        executor.map(fun, enumerate(shard_filenames))
-    print("Done.")
-
-
-class PretokDataset(torch.utils.data.IterableDataset):
-    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""
-
-    def __init__(self, split, max_seq_len, vocab_size, vocab_source):
-        super().__init__()
-        self.split = split
-        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.vocab_source = vocab_source
-
-    def __iter__(self):
-        # get worker info within a DataLoader
-        worker_info = torch.utils.data.get_worker_info()
-        worker_id = worker_info.id if worker_info else 0
-        # get DDP rank info
-        rank = dist.get_rank() if dist.is_initialized() else 0
-        # combine the worker_id and worker_rank to create a unique seed for rng
-        seed = 42 + worker_id + 1337 * rank
-        rng = random.Random(seed)
-        print(f"Created a PretokDataset with rng seed {seed}")
-        if self.vocab_source == "llama2":
-            # the .bin files are right along the .json files
-            bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
-            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
-        elif self.vocab_source == "custom":
-            # the .bin files are in tok{N} directory
-            bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}")
-            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
-        # train/test split. let's use only shard 0 for test split, rest train
-        shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
-        while True:
-            rng.shuffle(shard_filenames)
-            for shard in shard_filenames:
-                # open the dataset for reading but keep it on disk with memmap
-                m = np.memmap(shard, dtype=np.uint16, mode="r")
-                num_batches = len(m) // self.max_seq_len
-                num_batches -= 1  # drop the last partial batch
-                assert num_batches > 0, "this shard is way too small? investigate."
-                ixs = list(range(num_batches))
-                rng.shuffle(ixs)
-                for ix in ixs:
-                    start = ix * self.max_seq_len
-                    end = start + self.max_seq_len + 1
-                    # calling .astype will copy the data into a new numpy array, now in RAM
-                    chunk = torch.from_numpy((m[start:end]).astype(np.int64))
-                    x = chunk[:-1]
-                    y = chunk[1:]
-                    yield x, y
-
-# -----------------------------------------------------------------------------
-# public interface functions
-
-def get_tokenizer_model_path(vocab_size):
-    """
-    Returns path to the sentencepiece tokenizer model for a given vocab size
-    vocab_size = 0 designates the default Llama 2 tokenizer, in that case
-    None is returned.
-    """
-    if vocab_size == 0:
-        return None
-    else:
-        return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
-
-class Task:
-
-    @staticmethod
-    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
-        ds = PretokDataset(**dataset_kwargs)
-        dl = torch.utils.data.DataLoader(
-            ds, batch_size=batch_size, pin_memory=True, num_workers=num_workers
-        )
-        for x, y in dl:
-            x = x.to(device, non_blocking=True)
-            y = y.to(device, non_blocking=True)
-            yield x, y
-
-# -----------------------------------------------------------------------------
-# CLI for constructing the dataset
-
-if __name__ == "__main__":
-    """
-    These stages are designed to be run in order.
-
-    To tokenize data with the Llama 2 tokenizer:
-    python tinystories.py download
-    python tinystories.py pretokenize
-
-    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
-    python tinystories.py download
-    python tinystories.py train_vocab --vocab_size=2048
-    python tinystories.py pretokenize --vocab_size=2048
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
-    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
-    args = parser.parse_args()
-
-    # depending on the stage call the appropriate function
-    if args.stage == "download":
-        download()
-    elif args.stage == "train_vocab":
-        train_vocab(vocab_size=args.vocab_size)
-    elif args.stage == "pretokenize":
-        pretokenize(vocab_size=args.vocab_size)
-    else:
-        raise ValueError(f"Unknown stage {args.stage}")
diff --git a/tokenizer.py b/tokenizer.py
deleted file mode 100644
index f3c0cc3..0000000
--- a/tokenizer.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Taken from llama code and lightly modified
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-import os
-import struct
-import argparse
-from typing import List
-
-from sentencepiece import SentencePieceProcessor
-
-TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
-
-class Tokenizer:
-    def __init__(self, tokenizer_model=None):
-        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-        self.model_path = model_path
-
-        # BOS / EOS token IDs
-        self.n_words: int = self.sp_model.vocab_size()
-        self.bos_id: int = self.sp_model.bos_id()
-        self.eos_id: int = self.sp_model.eos_id()
-        self.pad_id: int = self.sp_model.pad_id()
-        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
-        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
-
-    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
-        assert type(s) is str
-        t = self.sp_model.encode(s)
-        if bos:
-            t = [self.bos_id] + t
-        if eos:
-            t = t + [self.eos_id]
-        return t
-
-    def decode(self, t: List[int]) -> str:
-        return self.sp_model.decode(t)
-
-    def export(self):
-
-        # get all the tokens (postprocessed) and their scores as floats
-        tokens, scores = [], []
-        for i in range(self.n_words):
-
-            # decode the token and light postprocessing
-            t = self.sp_model.id_to_piece(i)
-            s = self.sp_model.get_score(i)
-            if i == self.bos_id:
-                t = '\n<s>\n'
-            elif i == self.eos_id:
-                t = '\n</s>\n'
-            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
-            b = t.encode('utf-8') # bytes of this token, utf-8 encoded
-
-            tokens.append(b)
-            scores.append(s)
-
-        # record the max token length
-        max_token_length = max(len(t) for t in tokens)
-
-        # write to a binary file
-        # the tokenizer.bin file is the same as .model file, but .bin
-        tokenizer_bin = self.model_path.replace('.model', '.bin')
-        with open(tokenizer_bin, 'wb') as f:
-            f.write(struct.pack("I", max_token_length))
-            for bytes, score in zip(tokens, scores):
-                f.write(struct.pack("fI", score, len(bytes)))
-                f.write(bytes)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ")
-    args = parser.parse_args()
-
-    t = Tokenizer(args.tokenizer_model)
-    t.export()
diff --git a/train.py b/train.py
deleted file mode 100644
index b1972dc..0000000
--- a/train.py
+++ /dev/null
@@ -1,342 +0,0 @@
-"""
-This training script can be run both on a single gpu in debug mode,
-and also in a larger training run with distributed data parallel (ddp).
-
-To run on a single GPU small debug run, example:
-$ python -m train.py --compile=False --eval_iters=10 --batch_size=8
-
-To run with DDP on 4 gpus on 1 node, example:
-$ torchrun --standalone --nproc_per_node=4 train.py
-
-To run with DDP on 4 gpus across 2 nodes, example:
-- Run on the first (master) node with example IP 123.456.123.456:
-$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
-- Run on the worker node:
-$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
-(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
-"""
-
-import math
-import os
-import time
-from contextlib import nullcontext
-from datetime import datetime
-from functools import partial
-
-import torch
-from model import Transformer, ModelArgs
-from torch.distributed import destroy_process_group, init_process_group
-from torch.nn.parallel import DistributedDataParallel as DDP
-
-from tinystories import Task
-
-# -----------------------------------------------------------------------------
-# I/O
-out_dir = "out"
-eval_interval = 2000
-log_interval = 1
-eval_iters = 100
-eval_only = False  # if True, script exits right after the first eval
-always_save_checkpoint = False  # if True, always save a checkpoint after each eval
-init_from = "scratch"  # 'scratch' or 'resume'
-# wandb logging
-wandb_log = False  # disabled by default
-wandb_project = "llamac"
-wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
-# data
-batch_size = 128  # if gradient_accumulation_steps > 1, this is the micro-batch size
-max_seq_len = 256
-vocab_source = "llama2" # llama2|custom; use Lllama 2 vocab from Meta, or custom trained
-vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
-# model
-dim = 288
-n_layers = 6
-n_heads = 6
-n_kv_heads = 6
-multiple_of = 32
-dropout = 0.0
-# adamw optimizer
-gradient_accumulation_steps = 4  # used to simulate larger batch sizes
-learning_rate = 5e-4  # max learning rate
-max_iters = 100000  # total number of training iterations
-weight_decay = 1e-1
-beta1 = 0.9
-beta2 = 0.95
-grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
-# learning rate decay settings
-decay_lr = True  # whether to decay the learning rate
-warmup_iters = 1000  # how many steps to warm up for
-# system
-device = "cuda"  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
-dtype = "bfloat16"  # float32|bfloat16|float16
-compile = True  # use PyTorch 2.0 to compile the model to be faster
-# -----------------------------------------------------------------------------
-config_keys = [
-    k
-    for k, v in globals().items()
-    if not k.startswith("_") and isinstance(v, (int, float, bool, str))
-]
-exec(open("configurator.py").read())  # overrides from command line or config file
-config = {k: globals()[k] for k in config_keys}  # will be useful for logging
-# -----------------------------------------------------------------------------
-
-# fixing some hyperparams to sensible defaults
-lr_decay_iters = max_iters  # should be ~= max_iters per Chinchilla
-min_lr = 0.0  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
-
-# validating checks
-assert vocab_source in ["llama2", "custom"]
-assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
-
-# various inits, derived attributes, I/O setup
-ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
-if ddp:
-    init_process_group(backend="nccl")
-    ddp_rank = int(os.environ["RANK"])
-    ddp_local_rank = int(os.environ["LOCAL_RANK"])
-    ddp_world_size = int(os.environ["WORLD_SIZE"])
-    device = f"cuda:{ddp_local_rank}"
-    torch.cuda.set_device(device)
-    master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
-    seed_offset = ddp_rank  # each process gets a different seed
-    # world_size number of processes will be training simultaneously, so we can scale
-    # down the desired gradient accumulation iterations per process proportionally
-    assert gradient_accumulation_steps % ddp_world_size == 0
-    gradient_accumulation_steps //= ddp_world_size
-else:
-    # if not ddp, we are running on a single gpu, and one process
-    master_process = True
-    seed_offset = 0
-    ddp_world_size = 1
-tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len
-if master_process:
-    print(f"tokens per iteration will be: {tokens_per_iter:,}")
-    print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len")
-
-if master_process:
-    os.makedirs(out_dir, exist_ok=True)
-torch.manual_seed(1337 + seed_offset)
-torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
-torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
-device_type = "cuda" if "cuda" in device else "cpu"  # for later use in torch.autocast
-# note: float16 data type will automatically use a GradScaler
-ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]
-ctx = (
-    nullcontext()
-    if device_type == "cpu"
-    else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
-)
-
-# task-specific setup
-iter_batches = partial(
-    Task.iter_batches,
-    batch_size=batch_size,
-    max_seq_len=max_seq_len,
-    vocab_size=vocab_size,
-    vocab_source=vocab_source,
-    device=device,
-    num_workers=0,
-)
-
-# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
-iter_num = 0
-best_val_loss = 1e9
-
-# model init
-model_args = dict(
-    dim=dim,
-    n_layers=n_layers,
-    n_heads=n_heads,
-    n_kv_heads=n_kv_heads,
-    vocab_size=vocab_size,
-    multiple_of=multiple_of,
-    max_seq_len=max_seq_len,
-    dropout=dropout,
-)  # start with model_args from command line
-if init_from == "scratch":
-    # init a new model from scratch
-    print("Initializing a new model from scratch")
-    gptconf = ModelArgs(**model_args)
-    model = Transformer(gptconf)
-elif init_from == "resume":
-    print(f"Resuming training from {out_dir}")
-    # resume training from a checkpoint.
-    ckpt_path = os.path.join(out_dir, "ckpt.pt")
-    checkpoint = torch.load(ckpt_path, map_location=device)
-    checkpoint_model_args = checkpoint["model_args"]
-    # force these config attributes to be equal otherwise we can't even resume training
-    # the rest of the attributes (e.g. dropout) can stay as desired from command line
-    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
-        model_args[k] = checkpoint_model_args[k]
-    # create the model
-    gptconf = ModelArgs(**model_args)
-    model = Transformer(gptconf)
-    state_dict = checkpoint["model"]
-    # fix the keys of the state dictionary :(
-    # honestly no idea how checkpoints sometimes get this prefix, have to debug more
-    unwanted_prefix = "_orig_mod."
-    for k, v in list(state_dict.items()):
-        if k.startswith(unwanted_prefix):
-            state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
-    model.load_state_dict(state_dict)
-    iter_num = checkpoint["iter_num"]
-    best_val_loss = checkpoint["best_val_loss"]
-model.to(device)
-
-# initialize a GradScaler. If enabled=False scaler is a no-op
-scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))
-
-# optimizer
-optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
-if init_from == "resume" and "optimizer" in checkpoint:
-    optimizer.load_state_dict(checkpoint["optimizer"])
-checkpoint = None  # free up memory
-
-# compile the model
-if compile:
-    print("compiling the model... (takes a ~minute)")
-    unoptimized_model = model
-    model = torch.compile(model)  # requires PyTorch 2.0
-
-# wrap model into DDP container
-if ddp:
-    # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
-    # construction time since NCCL does not support `ComplexFloat`
-    prefix = "_orig_mod." if compile else ""
-    model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
-    model = DDP(model, device_ids=[ddp_local_rank])
-
-# helps estimate an arbitrarily accurate loss over either split using many batches
-@torch.no_grad()
-def estimate_loss():
-    out = {}
-    model.eval()
-    for split in ["train", "val"]:
-        batch_iter = iter_batches(split=split)
-        losses = torch.zeros(eval_iters)  # keep on CPU
-        for k in range(eval_iters):
-            X, Y = next(batch_iter)
-            with ctx:
-                logits = model(X, Y)
-                loss = raw_model.last_loss
-            losses[k] = loss.item()
-        out[split] = losses.mean()
-    model.train()
-    return out
-
-# learning rate decay scheduler (cosine with warmup)
-def get_lr(it):
-    # 1) linear warmup for warmup_iters steps
-    if it < warmup_iters:
-        return learning_rate * it / warmup_iters
-    # 2) if it > lr_decay_iters, return min learning rate
-    if it > lr_decay_iters:
-        return min_lr
-    # 3) in between, use cosine decay down to min learning rate
-    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
-    assert 0 <= decay_ratio <= 1
-    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
-    return min_lr + coeff * (learning_rate - min_lr)
-
-# logging
-if wandb_log and master_process:
-    import wandb
-    wandb.init(project=wandb_project, name=wandb_run_name, config=config)
-
-# training loop
-train_batch_iter = iter_batches(split="train")
-X, Y = next(train_batch_iter)  # fetch the very first batch
-t0 = time.time()
-local_iter_num = 0  # number of iterations in the lifetime of this process
-raw_model = model.module if ddp else model  # unwrap DDP container if needed
-running_mfu = -1.0
-while True:
-    # determine and set the learning rate for this iteration
-    lr = get_lr(iter_num) if decay_lr else learning_rate
-    for param_group in optimizer.param_groups:
-        param_group["lr"] = lr
-
-    # evaluate the loss on train/val sets and write checkpoints
-    if iter_num % eval_interval == 0 and master_process:
-        losses = estimate_loss()
-        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
-        if wandb_log:
-            try:
-                wandb.log(
-                    {
-                        "iter": iter_num,
-                        "tokens": iter_num * tokens_per_iter,
-                        "loss/train": losses["train"],
-                        "loss/val": losses["val"],
-                        "lr": lr,
-                        "mfu": running_mfu * 100,  # convert to percentage
-                    }
-                )
-            except Exception as e:
-                print(f"logging to wandb failed: {e}")
-        if losses["val"] < best_val_loss or always_save_checkpoint:
-            best_val_loss = losses["val"]
-            if iter_num > 0:
-                checkpoint = {
-                    "model": raw_model.state_dict(),
-                    "optimizer": optimizer.state_dict(),
-                    "model_args": model_args,
-                    "iter_num": iter_num,
-                    "best_val_loss": best_val_loss,
-                    "config": config,
-                }
-                print(f"saving checkpoint to {out_dir}")
-                torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
-                raw_model.export(os.path.join(out_dir, "model.bin"))
-    if iter_num == 0 and eval_only:
-        break
-
-    # forward backward update, with optional gradient accumulation to simulate larger batch size
-    # and using the GradScaler if data type is float16
-    for micro_step in range(gradient_accumulation_steps):
-        if ddp:
-            # in DDP training we only need to sync gradients at the last micro step.
-            # the official way to do this is with model.no_sync() context manager, but
-            # I really dislike that this bloats the code and forces us to repeat code
-            # looking at the source of that context manager, it just toggles this variable
-            model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1
-        with ctx:
-            logits = model(X, Y)
-            loss = raw_model.last_loss
-            loss = loss / gradient_accumulation_steps
-        # immediately async prefetch next batch while model is doing the forward pass on the GPU
-        X, Y = next(train_batch_iter)
-        # backward pass, with gradient scaling if training in fp16
-        scaler.scale(loss).backward()
-    # clip the gradient
-    if grad_clip != 0.0:
-        scaler.unscale_(optimizer)
-        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
-    # step the optimizer and scaler if training in fp16
-    scaler.step(optimizer)
-    scaler.update()
-    # flush the gradients as soon as we can, no need for this memory anymore
-    optimizer.zero_grad(set_to_none=True)
-
-    # timing and logging
-    t1 = time.time()
-    dt = t1 - t0
-    t0 = t1
-    if iter_num % log_interval == 0 and master_process:
-        # get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
-        lossf = loss.item() * gradient_accumulation_steps
-        if local_iter_num >= 5:  # let the training loop settle a bit
-            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
-            running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
-        print(
-            f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
-        )
-    iter_num += 1
-    local_iter_num += 1
-
-    # termination conditions
-    if iter_num > max_iters:
-        break
-
-if ddp:
-    destroy_process_group()
diff --git a/train_vocab.sh b/train_vocab.sh
deleted file mode 100755
index 7803af8..0000000
--- a/train_vocab.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/bin/bash
-
-# Trains a sentencepiece tokenizer model on a bunch of given data, my best
-# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
-
-# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
-# example:
-# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
-# requirements:
-# install https://github.com/google/sentencepiece
-
-# check if the correct number of arguments are provided
-if [ $# -ne 3 ]; then
-    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
-    exit 1
-fi
-
-# assign command-line arguments to variables
-input=$1
-model_prefix=$2
-vocab_size=$3
-
-# check if input file exists
-if [ ! -f "$input" ]; then
-    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
-    echo "input '$input' not found."
-    exit 1
-fi
-
-# check if vocab_size is a positive integer
-if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
-    echo "Usage: $0 <input> <model_prefix> <vocab_size>"
-    echo "vocab_size size must be a positive integer."
-    exit 1
-fi
-
-# Print the processed inputs
-echo "Input: $input"
-echo "Model Prefix: $model_prefix"
-echo "Vocabulary Size: $vocab_size"
-
-# train a sentencepiece tokenizer model
-# Llama 2 config can be printed as follows:
-
-# import sentencepiece.sentencepiece_model_pb2
-# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
-# mp.ParseFromString(open("tokenizer.model", "rb").read())
-# print(mp.trainer_spec)
-# print(mp.normalizer_spec)
-
-# this gives:
-
-# trainer_spec {
-#   input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
-#   model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
-#   model_type: BPE
-#   vocab_size: 32000
-#   self_test_sample_size: 0
-#   input_format: "text"
-#   character_coverage: 0.9999499917030334
-#   input_sentence_size: 200000000
-#   seed_sentencepiece_size: 1000000
-#   shrinking_factor: 0.75
-#   num_threads: 80
-#   num_sub_iterations: 2
-#   max_sentence_length: 4192
-#   shuffle_input_sentence: true
-#   max_sentencepiece_length: 16
-#   split_by_unicode_script: true
-#   split_by_whitespace: true
-#   split_by_number: true
-#   treat_whitespace_as_suffix: false
-#   split_digits: true
-#   allow_whitespace_only_pieces: true
-#   vocabulary_output_piece_score: true
-#   hard_vocab_limit: true
-#   use_all_vocab: false
-#   byte_fallback: true
-#   required_chars: ""
-#   unk_id: 0
-#   bos_id: 1
-#   eos_id: 2
-#   pad_id: -1
-#   unk_surface: " \342\201\207 "
-#   unk_piece: "<unk>"
-#   bos_piece: "<s>"
-#   eos_piece: "</s>"
-#   pad_piece: "<pad>"
-#   train_extremely_large_corpus: false
-#   enable_differential_privacy: false
-#   differential_privacy_noise_level: 0.0
-#   differential_privacy_clipping_threshold: 0
-# }
-# normalizer_spec {
-#   name: "identity"
-#   precompiled_charsmap: ""
-#   add_dummy_prefix: true
-#   remove_extra_whitespaces: false
-#   normalization_rule_tsv: ""
-# }
-
-# let's now use spm_train to train this exact model
-# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
-
-# we'll depart on a few settings:
-# character_coverage -> 1.0
-
-# other important notes:
-# --split-digits = true, per the paper
-# --allow_whitespace_only_pieces is true, default in spm is false
-# --byte_fallback is true, default in spm is false
-# --normalization_rule_name is identity, default in spm is nmt_nfkc
-
-spm_train --input="$input" \
-          --model_prefix="$model_prefix" \
-          --model_type=bpe \
-          --vocab_size="$vocab_size" \
-          --self_test_sample_size=0 \
-          --input_format="text" \
-          --character_coverage=1.0 \
-          --num_threads="$(nproc)" \
-          --split_digits=true \
-          --allow_whitespace_only_pieces=true \
-          --byte_fallback=true \
-          --unk_surface=" \342\201\207 " \
-          --normalization_rule_name=identity \