only dart
This commit is contained in:
@@ -1,193 +0,0 @@
|
|||||||
# Builds the project and runs the pytest suite on Linux, macOS and Windows
# (MSVC and MinGW) for every push to master, every pull request, and on demand.
name: Continuous Integration

on:
  push:
    branches:
      - master
    paths:
      - '.github/workflows/**'
      - '**/Makefile'
      - '**/*.c'
      - '**/*.h'
      - '**/*.py'
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - '**/Makefile'
      - '**/*.c'
      - '**/*.h'
      - '**/*.py'
  # for manual triggering
  workflow_dispatch:

env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

jobs:
  # check basic builds to avoid breaking changes
  ubuntu-focal-make:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential -y

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Pip setup
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      - name: Build
        id: make_build
        run: |
          make

      - name: Build runfast
        id: make_build_runfast
        run: |
          make runfast

      - name: Test with pytest
        run: |
          pytest

  macOS-latest-make:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        # a failed `brew update` must not fail the build
        continue-on-error: true
        run: |
          brew update

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Pip setup
        run: |
          python -m pip install --upgrade pip
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

      - name: Build clang
        id: make_build_clang
        run: |
          make run CC=clang

      - name: Build
        id: make_build
        run: |
          make

      - name: Build runfast
        id: make_build_runfast
        run: |
          make runfast

      - name: Test with pytest
        run: pytest

  windows-latest-make:
    runs-on: windows-latest

    strategy:
      fail-fast: false  # necessary, otherwise the matrix breaks
      matrix:
        arch:
          - amd64
          - amd64_x86
          - amd64_arm64

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Setup MSBuild
        uses: microsoft/setup-msbuild@v2

      - name: Setup MSVC ${{ matrix.arch }}
        uses: ilammy/msvc-dev-cmd@v1
        with:
          arch: ${{ matrix.arch }}

      - name: Set up Python 3.10
        if: matrix.arch != 'amd64_arm64'
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Pip setup
        if: matrix.arch != 'amd64_arm64'
        run: |
          python -m pip install --upgrade pip
          if (Test-Path requirements.txt) {
            pip install -r requirements.txt
          }

      - name: Build ${{ matrix.arch }}
        id: build_msvc
        run: |
          .\build_msvc.bat

      # the arm64 binary is cross-compiled and cannot be run on the host
      - name: Test with pytest
        if: matrix.arch != 'amd64_arm64'
        run: pytest

  windows-latest-mingw:
    runs-on: windows-latest

    defaults:
      run:
        shell: msys2 {0}

    strategy:
      matrix:
        include:
          - sys: mingw64
            env: x86_64

    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@v4

      - uses: msys2/setup-msys2@v2
        id: setup-msys2
        with:
          msystem: ${{ matrix.sys }}
          install: mingw-w64-${{matrix.env}}-gcc make

      - name: Build ${{ matrix.sys }} ${{ matrix.env }}
        id: build_mingw
        run: |
          make win64

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # the Python steps override the msys2 default shell
      - name: Pip setup
        shell: powershell
        run: |
          python -m pip install --upgrade pip
          if (Test-Path requirements.txt) {
            pip install -r requirements.txt
          }

      - name: Test with pytest
        shell: powershell
        run: pytest
|
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
"""
|
|
||||||
Poor Man's Configurator. Probably a terrible idea. Example usage:
|
|
||||||
$ python train.py config/override_file.py --batch_size=32
|
|
||||||
this will first run config/override_file.py, then override batch_size to 32
|
|
||||||
|
|
||||||
The code in this file will be run as follows from e.g. train.py:
|
|
||||||
>>> exec(open('configurator.py').read())
|
|
||||||
|
|
||||||
So it's not a Python module, it's just shuttling this code away from train.py
|
|
||||||
The code in this script then overrides the globals()
|
|
||||||
|
|
||||||
I know people are not going to love this, I just really dislike configuration
|
|
||||||
complexity and having to prepend config. to every single variable. If someone
|
|
||||||
comes up with a better simple Python solution I am all ears.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
from ast import literal_eval
|
|
||||||
|
|
||||||
# Walk the command line once. Every argument is either a config file to execute
# (no '=' in it) or a --key=value override of a name that already exists in
# globals(). This code is meant to be exec()'d by the training script, so all
# assignments below deliberately land in that script's global namespace.
for arg in sys.argv[1:]:
    if '=' not in arg:
        # assume it's the name of a config file
        assert not arg.startswith('--')
        config_file = arg
        print(f"Overriding config with {config_file}:")
        # Use one handle for both the echo and the exec so the file is always
        # closed; the original opened it a second time and never closed it.
        with open(config_file) as f:
            print(f.read())
            f.seek(0)
            exec(f.read())
    else:
        # assume it's a --key=value argument
        assert arg.startswith('--')
        # Split on the first '=' only, so values such as --out_dir=a=b work
        # instead of failing with "too many values to unpack".
        key, val = arg.split('=', 1)
        key = key[2:]
        if key not in globals():
            raise ValueError(f"Unknown config key: {key}")
        try:
            # attempt to eval it (e.g. if bool, number, etc.)
            attempt = literal_eval(val)
        except (SyntaxError, ValueError):
            # if that goes wrong, just use the string
            attempt = val
        # ensure the override has the same type as the existing value
        assert type(attempt) == type(globals()[key])
        # cross fingers
        print(f"Overriding: {key} = {attempt}")
        globals()[key] = attempt
|
|
||||||
@@ -1,112 +0,0 @@
|
|||||||
"""
|
|
||||||
This script exports the Llama 2 weights in llama2c.bin format.
|
|
||||||
"""
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import struct
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from model import precompute_freqs_cis
|
|
||||||
|
|
||||||
|
|
||||||
def export(p, state_dict, filepath='model.bin'):
    """Export the model weights in fp32 into a .bin file to be read from C.

    The file starts with a header of seven C ints (struct format 'iiiiiii'),
    followed by each tensor flattened to float32 in a fixed order.

    Args:
        p: dict of hyperparameters. Reads 'dim', 'n_layers', 'n_heads' and the
            optional 'n_kv_heads'. 'vocab_size' and 'max_seq_len' are
            overwritten in place with 32000 and 2048.
        state_dict: mapping from parameter name to tensor. Entries are deleted
            as they are written; 'freqs_cos'/'freqs_sin' are added temporarily.
        filepath: path of the output file.

    Raises:
        KeyError: if an expected key is missing from `p` or `state_dict`.
    """
    # The context manager closes the file even when a serialize() call raises;
    # the original leaked the handle in that case.
    with open(filepath, 'wb') as f:

        def serialize(key):
            """Write state_dict[key] as flat float32 and drop it from the dict."""
            print(f"writing {key}...")
            t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
            f.write(memoryview(t))
            del state_dict[key]

        # first write out the header
        hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
        p['vocab_size'] = 32000
        p['max_seq_len'] = 2048

        n_kv_heads = p.get('n_kv_heads') or p['n_heads']
        header = struct.pack(
            'iiiiiii',
            p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
            n_kv_heads, -p['vocab_size'], p['max_seq_len']
        )
        # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
        # in the checkpoint and should be loaded.
        f.write(header)

        # next write out the embedding weights
        print("writing tok_embeddings...")
        serialize('tok_embeddings.weight')

        # now all the layers: each kind of weight is written for every layer
        # before moving on to the next kind (attention first, then ffn)
        per_layer_weights = (
            'attention_norm', 'attention.wq', 'attention.wk', 'attention.wv', 'attention.wo',
            'ffn_norm', 'feed_forward.w1', 'feed_forward.w2', 'feed_forward.w3',
        )
        for weight in per_layer_weights:
            for i in range(p['n_layers']):
                serialize(f'layers.{i}.{weight}.weight')

        # final rmsnorm
        serialize('norm.weight')
        # freqs_cos, freqs_sin
        freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
        state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
        state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
        serialize('freqs_cos')
        serialize('freqs_sin')

        # finally write the output weights
        serialize('output.weight')

    print(f"wrote {filepath}")
|
|
||||||
|
|
||||||
|
|
||||||
def concat_weights(models):
    """Merge a list of per-shard state dicts into a single state dict.

    Names are taken from the first shard. A tensor is copied from the first
    shard unchanged when there is only one shard or the tensor is 1-D.
    Otherwise the shards are concatenated: along dim 1 for token embeddings,
    attention output and feed-forward w2 weights, along dim 0 for the rest.
    Concatenated entries are deleted from every input shard.
    """
    dim1_suffixes = ('.attention.wo.weight', '.feed_forward.w2.weight')
    merged = {}
    for name in list(models[0]):
        shards = [shard[name] for shard in models]
        if len(shards) != 1 and len(shards[0].shape) != 1:
            on_dim1 = name.startswith('tok_embeddings.') or name.endswith(dim1_suffixes)
            merged[name] = torch.cat(shards, dim=1 if on_dim1 else 0)
            # drop the per-shard copies once they have been merged
            for shard in models:
                del shard[name]
        else:
            merged[name] = shards[0]
    return merged
|
|
||||||
|
|
||||||
|
|
||||||
def load_and_export(model_path, output_path):
    """Load a sharded checkpoint folder and export it as a single .bin file.

    Args:
        model_path: folder containing 'params.json' and one or more
            'consolidated.*.pth' shard files.
        output_path: path of the .bin file to write.

    Raises:
        FileNotFoundError: if 'params.json' or every shard file is missing.
    """
    params_path = os.path.join(model_path, 'params.json')
    with open(params_path) as f:
        params = json.load(f)
    print(params)

    # sort so the shards are concatenated in a stable order
    model_paths = sorted(Path(model_path).glob('consolidated.*.pth'))
    if not model_paths:
        # fail with a clear message instead of an IndexError in concat_weights
        raise FileNotFoundError(f"no consolidated.*.pth checkpoints found in {model_path}")
    models = [torch.load(p, map_location='cpu') for p in model_paths]
    state_dict = concat_weights(models)
    # release the per-shard dicts before writing the merged weights
    del models
    export(params, state_dict, output_path)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Both positional arguments are required. The original only checked for a
    # bare invocation, so a single argument crashed on sys.argv[2].
    if len(sys.argv) < 3:
        print('[Llama model folder path] [output path]')
        sys.exit(1)

    model_path = sys.argv[1]
    output_path = sys.argv[2]
    load_and_export(model_path, output_path)
|
|
||||||
@@ -1,113 +0,0 @@
|
|||||||
"""
|
|
||||||
This script exports the Llama 2 weights in llama2c.bin format.
|
|
||||||
"""
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import struct
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from model import precompute_freqs_cis
|
|
||||||
|
|
||||||
|
|
||||||
def export(p, state_dict, filepath='model.bin'):
    """Export the model weights in fp32 into a .bin file to be read from C.

    The file starts with a header of seven C ints (struct format 'iiiiiii'),
    followed by each tensor flattened to float32 in a fixed order. Tensors are
    looked up under 'model.*' / 'lm_head.weight' key names.

    Args:
        p: dict of hyperparameters. Reads 'dim', 'n_layers', 'n_heads' and the
            optional 'n_kv_heads'. 'vocab_size' and 'max_seq_len' are
            overwritten in place with 32000 and 2048.
        state_dict: mapping from parameter name to tensor. Entries are deleted
            as they are written; 'freqs_cos'/'freqs_sin' are added temporarily.
        filepath: path of the output file.

    Raises:
        KeyError: if an expected key is missing from `p` or `state_dict`.
    """
    # The context manager closes the file even when a serialize() call raises;
    # the original leaked the handle in that case.
    with open(filepath, 'wb') as f:

        def serialize(key):
            """Write state_dict[key] as flat float32 and drop it from the dict."""
            print(f"writing {key}...")
            t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
            f.write(memoryview(t))
            del state_dict[key]

        # first write out the header
        hidden_dim = state_dict['model.layers.0.mlp.gate_proj.weight'].shape[0]
        p['vocab_size'] = 32000
        p['max_seq_len'] = 2048

        n_kv_heads = p.get('n_kv_heads') or p['n_heads']
        header = struct.pack(
            'iiiiiii',
            p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
            n_kv_heads, -p['vocab_size'], p['max_seq_len']
        )
        # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
        # in the checkpoint and should be loaded.
        f.write(header)

        # next write out the embedding weights
        print("writing tok_embeddings...")
        serialize('model.embed_tokens.weight')

        # now all the layers: each kind of weight is written for every layer
        # before moving on to the next kind (attention first, then ffn)
        per_layer_weights = (
            'input_layernorm', 'self_attn.q_proj', 'self_attn.k_proj',
            'self_attn.v_proj', 'self_attn.o_proj',
            'post_attention_layernorm', 'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj',
        )
        for weight in per_layer_weights:
            for i in range(p['n_layers']):
                serialize(f'model.layers.{i}.{weight}.weight')

        # final rmsnorm
        serialize('model.norm.weight')
        # freqs_cos, freqs_sin
        freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
        state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
        state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
        # TODO: check if this requires additional conversion
        serialize('freqs_cos')
        serialize('freqs_sin')

        # finally write the output weights
        serialize('lm_head.weight')

    print(f"wrote {filepath}")
|
|
||||||
|
|
||||||
|
|
||||||
def concat_weights(models):
    """Merge a list of per-shard state dicts into a single state dict.

    Names are taken from the first shard. A tensor is copied from the first
    shard unchanged when there is only one shard or the tensor is 1-D.
    Otherwise the shards are concatenated: along dim 1 for the token
    embedding, attention o_proj and mlp down_proj weights, along dim 0 for the
    rest. Concatenated entries are deleted from every input shard.
    """
    dim1_suffixes = ('.self_attn.o_proj.weight', '.mlp.down_proj.weight')
    merged = {}
    for name in list(models[0]):
        shards = [shard[name] for shard in models]
        if len(shards) != 1 and len(shards[0].shape) != 1:
            on_dim1 = name.startswith('model.embed_tokens.weight') or name.endswith(dim1_suffixes)
            merged[name] = torch.cat(shards, dim=1 if on_dim1 else 0)
            # drop the per-shard copies once they have been merged
            for shard in models:
                del shard[name]
        else:
            merged[name] = shards[0]
    return merged
|
|
||||||
|
|
||||||
|
|
||||||
def load_and_export(model_path, output_path):
    """Load a sharded checkpoint folder and export it as a single .bin file.

    Args:
        model_path: folder containing 'params.json' and one or more
            'consolidated.*.pth' shard files.
        output_path: path of the .bin file to write.

    Raises:
        FileNotFoundError: if 'params.json' or every shard file is missing.
    """
    params_path = os.path.join(model_path, 'params.json')
    with open(params_path) as f:
        params = json.load(f)
    print(params)

    # sort so the shards are concatenated in a stable order
    model_paths = sorted(Path(model_path).glob('consolidated.*.pth'))
    if not model_paths:
        # fail with a clear message instead of an IndexError in concat_weights
        raise FileNotFoundError(f"no consolidated.*.pth checkpoints found in {model_path}")
    models = [torch.load(p, map_location='cpu') for p in model_paths]
    state_dict = concat_weights(models)
    # release the per-shard dicts before writing the merged weights
    del models
    export(params, state_dict, output_path)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Both positional arguments are required. The original only checked for a
    # bare invocation, so a single argument crashed on sys.argv[2].
    if len(sys.argv) < 3:
        print('[Llama model folder path] [output path]')
        sys.exit(1)

    model_path = sys.argv[1]
    output_path = sys.argv[2]
    load_and_export(model_path, output_path)
|
|
||||||
@@ -1,392 +0,0 @@
|
|||||||
import math
|
|
||||||
import struct
|
|
||||||
import inspect
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Any, Optional, Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
import torch.nn.functional as F
|
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
@dataclass
class ModelArgs:
    # Hyperparameter bundle; the defaults are those of the Llama 7B model.
    # Field order is part of the constructor signature, so do not reorder.

    # width of the residual stream / token embeddings
    dim: int = 4096
    # number of transformer blocks
    n_layers: int = 32
    # number of query heads
    n_heads: int = 32
    # number of key/value heads; None means "same as n_heads"
    n_kv_heads: Optional[int] = None
    vocab_size: int = 32000
    # the MLP hidden layer size is rounded up to a multiple of this
    multiple_of: int = 256
    # epsilon added inside the RMSNorm square root
    norm_eps: float = 1e-5
    max_seq_len: int = 2048
    dropout: float = 0.0
|
|
||||||
|
|
||||||
|
|
||||||
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain."""

    def __init__(self, dim: int, eps: float):
        """Create a gain vector of `dim` ones; `eps` is added before the rsqrt."""
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Divide x by the root of its mean square along the last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Normalise in float32, cast back to x's dtype, then apply the gain."""
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight
|
|
||||||
|
|
||||||
|
|
||||||
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the rotary-embedding cosine and sine tables.

    Returns a tuple (freqs_cos, freqs_sin), each of shape (end, dim // 2),
    where entry [t, k] is cos/sin of t * theta ** (-2k / dim).
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    # cosine is the real part and sine the imaginary part of the rotation
    return torch.cos(angles), torch.sin(angles)
|
|
||||||
|
|
||||||
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
|
|
||||||
ndim = x.ndim
|
|
||||||
assert 0 <= 1 < ndim
|
|
||||||
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
|
|
||||||
shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
|
|
||||||
return freqs_cis.view(shape)
|
|
||||||
|
|
||||||
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cos: torch.Tensor,
    freqs_sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary position embeddings to the query and key tensors.

    Adjacent pairs along the last dimension are treated as (real, imaginary)
    parts and rotated by the angles given through `freqs_cos` / `freqs_sin`.
    The maths is done in float32 and the results are cast back to the input
    dtypes.
    """

    def split_pairs(t):
        # (..., d) -> two tensors (..., d/2): even and odd positions
        return t.float().reshape(t.shape[:-1] + (-1, 2)).unbind(-1)

    q_re, q_im = split_pairs(xq)
    k_re, k_im = split_pairs(xk)

    # both tables are shaped against the query halves, as in the original
    freqs_cos = reshape_for_broadcast(freqs_cos, q_re)
    freqs_sin = reshape_for_broadcast(freqs_sin, q_re)

    def rotate(re, im):
        # complex multiplication written out with real numbers
        out_re = re * freqs_cos - im * freqs_sin
        out_im = re * freqs_sin + im * freqs_cos
        # interleave the parts again and flatten the last two dimensions
        return torch.stack([out_re, out_im], dim=-1).flatten(3)

    xq_out = rotate(q_re, q_im)
    xk_out = rotate(k_re, k_im)
    return xq_out.type_as(xq), xk_out.type_as(xk)
|
|
||||||
|
|
||||||
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep != 1:
        # add a repeat axis after the head axis, expand it, then merge the two
        expanded = x[:, :, :, None, :].expand(bs, slen, n_kv_heads, n_rep, head_dim)
        return expanded.reshape(bs, slen, n_kv_heads * n_rep, head_dim)
    # nothing to repeat: hand back the very same tensor
    return x
|
|
||||||
|
|
||||||
class Attention(nn.Module):
    """Causal multi-head self-attention with rotary embeddings and grouped KV heads."""

    def __init__(self, args: ModelArgs):
        """Build the projections; key/value heads default to the number of query heads."""
        super().__init__()
        self.n_kv_heads = args.n_kv_heads if args.n_kv_heads is not None else args.n_heads
        assert args.n_heads % self.n_kv_heads == 0
        model_parallel_size = 1
        self.n_local_heads = args.n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        # how many query heads share one key/value head
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // args.n_heads

        q_width = args.n_heads * self.head_dim
        kv_width = self.n_kv_heads * self.head_dim
        self.wq = nn.Linear(args.dim, q_width, bias=False)
        self.wk = nn.Linear(args.dim, kv_width, bias=False)
        self.wv = nn.Linear(args.dim, kv_width, bias=False)
        self.wo = nn.Linear(q_width, args.dim, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout

        # use flash attention or a manual implementation?
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # additive causal mask: -inf strictly above the diagonal, 0 elsewhere
            causal = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            causal = torch.triu(causal, diagonal=1)
            self.register_buffer("mask", causal)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
    ):
        """Attend over `x` of shape (bsz, seqlen, dim) and return the same shape."""
        bsz, seqlen, _ = x.shape

        # QKV projections, split into heads
        xq = self.wq(x)
        xk = self.wk(x)
        xv = self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # RoPE relative positional embeddings
        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)

        # grouped multiquery attention: expand out keys and values
        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)

        # make heads into a batch dimension: (bs, n_local_heads, seqlen, head_dim)
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        if not self.flash:
            # manual implementation
            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            assert hasattr(self, 'mask')
            scores = scores + self.mask[:, :, :seqlen, :seqlen]  # (bs, n_local_heads, seqlen, seqlen)
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
        else:
            # flash implementation; dropout is only active while training
            dropout_p = self.dropout if self.training else 0.0
            output = torch.nn.functional.scaled_dot_product_attention(
                xq, xk, xv, attn_mask=None, dropout_p=dropout_p, is_causal=True
            )

        # restore time as batch dimension and concat heads
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        # final projection into the residual stream
        return self.resid_dropout(self.wo(output))
|
|
||||||
|
|
||||||
|
|
||||||
class FeedForward(nn.Module):
    """Gated MLP: w2(silu(w1(x)) * w3(x)), followed by dropout."""

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        """Shrink `hidden_dim` to two thirds, then round it up to a multiple of `multiple_of`."""
        super().__init__()
        two_thirds = int(2 * hidden_dim / 3)
        n_chunks = (two_thirds + multiple_of - 1) // multiple_of
        hidden_dim = multiple_of * n_chunks
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the gated MLP to `x`; the last dimension stays `dim`."""
        gate = F.silu(self.w1(x))
        up = self.w3(x)
        return self.dropout(self.w2(gate * up))
|
|
||||||
|
|
||||||
|
|
||||||
class TransformerBlock(nn.Module):
    """One pre-norm transformer layer: attention then feed-forward, each with a residual."""

    def __init__(self, layer_id: int, args: ModelArgs):
        """Build the attention, feed-forward and the two RMSNorm sub-modules."""
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        # the feed-forward receives 4 * dim and derives its real hidden size from it
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        """Return x after the attention and feed-forward residual updates."""
        attn_out = self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        h = x + attn_out
        ffn_out = self.feed_forward.forward(self.ffn_norm(h))
        return h + ffn_out
|
|
||||||
|
|
||||||
|
|
||||||
class Transformer(nn.Module):
|
|
||||||
last_loss: Optional[torch.Tensor]
|
|
||||||
|
|
||||||
def __init__(self, params: ModelArgs):
    """Build the embedding, the stack of transformer blocks and the output head.

    The token embedding shares its weight with the output projection. The
    rotary cos/sin tables are registered as non-persistent buffers. All
    Linear/Embedding weights are initialised with std 0.02, and the
    projections that write into the residual stream use a smaller std.
    """
    super().__init__()
    self.params = params
    self.vocab_size = params.vocab_size
    self.n_layers = params.n_layers

    self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
    self.dropout = nn.Dropout(params.dropout)
    self.layers = torch.nn.ModuleList()
    for layer_id in range(params.n_layers):
        self.layers.append(TransformerBlock(layer_id, params))
    self.norm = RMSNorm(params.dim, eps=params.norm_eps)
    self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

    # share the unembedding parameters with the embedding parameters
    self.tok_embeddings.weight = self.output.weight  # https://paperswithcode.com/method/weight-tying

    # some useful precompute for the RoPE relative positional embeddings
    freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
    self.register_buffer("freqs_cos", freqs_cos, persistent=False)
    self.register_buffer("freqs_sin", freqs_sin, persistent=False)

    # init all weights
    self.apply(self._init_weights)
    # Apply special scaled init to the residual projections, per GPT-2 paper.
    # These are the attention output projection (wo) and the feed-forward
    # output projection (w2). The original matched 'w3.weight', which is the
    # feed-forward gating input and does not feed the residual stream directly.
    for pn, p in self.named_parameters():
        if pn.endswith(('w2.weight', 'wo.weight')):
            torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

    # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor.
    self.last_loss = None
|
|
||||||
|
|
||||||
def _init_weights(self, module):
|
|
||||||
if isinstance(module, nn.Linear):
|
|
||||||
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
|
||||||
if module.bias is not None:
|
|
||||||
torch.nn.init.zeros_(module.bias)
|
|
||||||
elif isinstance(module, nn.Embedding):
|
|
||||||
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
|
||||||
|
|
||||||
def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Run the model on `tokens` of shape (batch, seqlen) and return logits.

    With `targets`, logits are computed for every position and the
    cross-entropy loss (ignoring target index -1) is stored in
    `self.last_loss`. Without `targets`, only the last position is projected
    and `self.last_loss` is reset to None.
    """
    _bsz, seqlen = tokens.shape
    h = self.dropout(self.tok_embeddings(tokens))
    # only the first `seqlen` rows of the rotary tables are needed
    cos = self.freqs_cos[:seqlen]
    sin = self.freqs_sin[:seqlen]

    for block in self.layers:
        h = block(h, cos, sin)
    h = self.norm(h)

    if targets is None:
        # inference-time mini-optimization: only forward the output on the very last position
        logits = self.output(h[:, [-1], :])  # note: using list [-1] to preserve the time dim
        self.last_loss = None
        return logits

    # if we are given some desired targets also calculate the loss
    logits = self.output(h)
    flat_logits = logits.view(-1, logits.size(-1))
    self.last_loss = F.cross_entropy(flat_logits, targets.view(-1), ignore_index=-1)
    return logits
|
|
||||||
|
|
||||||
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
    """Create an AdamW optimizer with weight decay on tensors of 2 or more dimensions.

    Trainable parameters with fewer than 2 dimensions get a weight decay of
    0.0. The fused AdamW implementation is used when torch offers it and
    `device_type` is 'cuda'.
    """
    # only parameters that require grad take part in optimisation
    trainable = [p for _, p in self.named_parameters() if p.requires_grad]
    # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
    decay_params = [p for p in trainable if p.dim() >= 2]
    nodecay_params = [p for p in trainable if p.dim() < 2]
    optim_groups = [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': nodecay_params, 'weight_decay': 0.0}
    ]
    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)
    print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
    print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

    # use the fused AdamW kernel only when this torch exposes it and we are on cuda
    fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
    use_fused = fused_available and device_type == 'cuda'
    extra_args = {'fused': True} if use_fused else {}
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
    print(f"using fused AdamW: {use_fused}")

    return optimizer
|
|
||||||
|
|
||||||
def estimate_mfu(self, fwdbwd_per_iter, dt):
    """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
    # FLOPs per iteration, following PaLM paper Appendix B:
    # https://arxiv.org/abs/2204.02311
    n_params = sum(p.numel() for p in self.parameters())
    cfg = self.params
    head_dim = cfg.dim // cfg.n_heads
    seq_len = cfg.max_seq_len
    flops_per_token = 6 * n_params + 12 * cfg.n_layers * cfg.n_heads * head_dim * seq_len
    flops_per_iter = flops_per_token * seq_len * fwdbwd_per_iter
    # dt is the duration of one iteration, so this is FLOPs per second
    flops_achieved = flops_per_iter * (1.0/dt)
    # A100 GPU bfloat16 peak flops is 312 TFLOPS
    flops_promised = 312e12
    return flops_achieved / flops_promised
|
|
||||||
|
|
||||||
@torch.inference_mode()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Extend the conditioning sequence `idx` (LongTensor of shape (b,t)) by
    max_new_tokens tokens, feeding each prediction back into the model.
    A temperature of 0.0 picks the most likely token; otherwise tokens are
    sampled, optionally from the top_k most likely ones only.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.
    Also note this is a super inefficient version of sampling with no key/value cache.
    """
    max_len = self.params.max_seq_len
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it to max_seq_len
        if idx.size(1) <= max_len:
            context = idx
        else:
            context = idx[:, -max_len:]
        # forward the model and keep only the logits of the final time step
        logits = self(context)[:, -1, :]
        if temperature == 0.0:
            # "sample" the single most likely index
            _, idx_next = torch.topk(logits, k=1, dim=-1)
        else:
            # scale by the desired temperature
            logits = logits / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                top_vals, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < top_vals[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx
|
|
||||||
|
|
||||||
def export(self, filepath='model.bin'):
    """Export the model weights in fp32 into a .bin file to be read from C.

    File layout, in order:
      1. header of seven native ints: dim, hidden_dim, n_layers, n_heads,
         n_kv_heads, vocab_size, max_seq_len
      2. token embedding table
      3. for each weight kind, the weights of all layers back to back:
         attention_norm, wq, wk, wv, wo, ffn_norm, w1, w2, w3
      4. final norm weight
      5. the first max_seq_len rows of freqs_cos and then of freqs_sin

    Args:
        filepath: destination path of the binary file.
    """
    p = self.params
    # hidden_dim is not read from params; it is taken from the first layer's w1
    hidden_dim = self.layers[0].feed_forward.w1.weight.shape[0]
    n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads

    # the context manager closes the file even if serialization raises midway
    with open(filepath, 'wb') as f:

        def serialize(t):
            # flatten to fp32 and dump the raw buffer; this yields the same
            # native-order bytes as struct.pack(f'{n}f', *values) without
            # turning every weight into a separate Python argument
            d = t.detach().cpu().view(-1).numpy().astype(np.float32)
            f.write(d.tobytes())

        # first write out the header
        header = struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                             n_kv_heads, p.vocab_size, p.max_seq_len)
        f.write(header)

        # next write out the embedding weights
        serialize(self.tok_embeddings.weight)

        # now all the layers
        # attention weights
        for layer in self.layers:
            serialize(layer.attention_norm.weight)
        for layer in self.layers:
            serialize(layer.attention.wq.weight)
        for layer in self.layers:
            serialize(layer.attention.wk.weight)
        for layer in self.layers:
            serialize(layer.attention.wv.weight)
        for layer in self.layers:
            serialize(layer.attention.wo.weight)
        # ffn weights
        for layer in self.layers:
            serialize(layer.ffn_norm.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w1.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w2.weight)
        for layer in self.layers:
            serialize(layer.feed_forward.w3.weight)
        # final rmsnorm
        serialize(self.norm.weight)
        # note: no need to write final classifier weights due to weight sharing
        # freqs_cis
        serialize(self.freqs_cos[:p.max_seq_len])
        serialize(self.freqs_sin[:p.max_seq_len])

    print(f"wrote {filepath}")
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
numpy==1.23.5
|
|
||||||
pytest==7.4.0
|
|
||||||
Requests==2.31.0
|
|
||||||
sentencepiece==0.1.99
|
|
||||||
torch==2.0.1
|
|
||||||
tqdm==4.64.1
|
|
||||||
wandb==0.15.5
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"id": "HLdoj4cz-xal"
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"# Run.c\n",
|
|
||||||
"\n",
|
|
||||||
"[](https://colab.research.google.com/github/karpathy/llama2.c/blob/master/run.ipynb)\n",
|
|
||||||
"\n",
|
|
||||||
"More details can be found in the [README.md](README.md) ."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"id": "Une3Ozlnu1B7"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#@title Clone Project\n",
|
|
||||||
"\n",
|
|
||||||
"!git clone https://github.com/karpathy/llama2.c.git\n",
|
|
||||||
"%cd llama2.c"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#@title Build\n",
|
|
||||||
"\n",
|
|
||||||
"!make runfast"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"id": "thm0ZBrtSgoC"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#@title Pick Your Model\n",
|
|
||||||
"\n",
|
|
||||||
"#@markdown Choose model\n",
|
|
||||||
"model = \"stories15M\" #@param [\"stories15M\", \"stories42M\", \"stories110M\"]\n",
|
|
||||||
"\n",
|
|
||||||
"download_url = \"\"\n",
|
|
||||||
"\n",
|
|
||||||
"if(model == \"stories15M\"):\n",
|
|
||||||
" download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin\"\n",
|
|
||||||
"if(model == \"stories42M\"):\n",
|
|
||||||
" download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin\"\n",
|
|
||||||
"if(model == \"stories110M\"):\n",
|
|
||||||
" download_url = \"https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin\"\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"download_url: {download_url}\")\n",
|
|
||||||
"\n",
|
|
||||||
"!wget $download_url\n",
|
|
||||||
"\n",
|
|
||||||
"model_file = model + \".bin\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"id": "OgAc3KjuT-NM"
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#@title Generate Stories\n",
|
|
||||||
"\n",
|
|
||||||
"# Generate args\n",
|
|
||||||
"max_token = 256 #@param {type:\"slider\", min:32, max:1024, step:32}\n",
|
|
||||||
"temperature = 0.8 #@param {type:\"slider\", min:0.0, max:1, step:0.05}\n",
|
|
||||||
"top_p = 0.9 #@param {type:\"slider\", min:0.0, max:1.0, step:0.05}\n",
|
|
||||||
"prompt = \"One day, Lily met a Shoggoth\" #@param {type:\"string\"}\n",
|
|
||||||
"\n",
|
|
||||||
"print(f\"model: {model_file}, max_token: {max_token}, temperature: {temperature}, top_p: {top_p}, prompt: {prompt}\")\n",
|
|
||||||
"print(f\"----------------------------\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
"cmd = f'./run {model_file} -t {temperature} -p {top_p} -n {max_token} -i \"{prompt}\"'\n",
|
|
||||||
"!{cmd}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#@title Run Meta's Llama 2 models\n",
|
|
||||||
"\n",
|
|
||||||
"#@markdown input your huggingface [access token](https://huggingface.co/settings/tokens) to download Meta's Llama 2 models.\n",
|
|
||||||
"\n",
|
|
||||||
"from huggingface_hub import snapshot_download\n",
|
|
||||||
"\n",
|
|
||||||
"token = \"replace your huggingface access token\" #@param {type:\"string\"}\n",
|
|
||||||
"path = snapshot_download(repo_id=\"meta-llama/Llama-2-7b\",cache_dir=\"Llama-2-7b\", use_auth_token=token)\n",
|
|
||||||
"\n",
|
|
||||||
"!python export_meta_llama_bin.py $path llama2_7b.bin\n",
|
|
||||||
"\n",
|
|
||||||
"print(\"./run llama2_7b.bin\\n\")\n",
|
|
||||||
"!./run llama2_7b.bin"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"private_outputs": true,
|
|
||||||
"provenance": []
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0
|
|
||||||
}
|
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
"""
|
|
||||||
Sample from the trained model with PyTorch
|
|
||||||
"""
|
|
||||||
import os
|
|
||||||
import pickle
|
|
||||||
from contextlib import nullcontext
|
|
||||||
import torch
|
|
||||||
from model import ModelArgs, Transformer
|
|
||||||
from tokenizer import Tokenizer
|
|
||||||
|
|
||||||
from tinystories import get_tokenizer_model_path
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
# Default settings. configurator.py is executed right after these assignments
# and may override any of them from the command line or a config file.
checkpoint = 'out/ckpt.pt'
start = "" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 1 # number of samples to draw
max_new_tokens = 100 # number of tokens generated in each sample
temperature = 1.0 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 300 # retain only the top_k most likely tokens, clamp others to have 0 probability
tokenizer = "" # override the tokenizer model path
seed = 1337
device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
dtype = "float32"
compile = False # use PyTorch 2.0 to compile the model to be faster
# overrides from command line or config file; the handle is closed
# deterministically instead of being left to the garbage collector
with open('configurator.py') as _configurator:
    exec(_configurator.read())
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# autocast is only entered for cuda devices; on cpu the context is a no-op
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# init from a model saved in a specific directory
checkpoint_dict = torch.load(checkpoint, map_location=device)
gptconf = ModelArgs(**checkpoint_dict['model_args'])
model = Transformer(gptconf)
state_dict = checkpoint_dict['model']
# strip the '_orig_mod.' prefix from parameter names so they match this model's keys
unwanted_prefix = '_orig_mod.'
for k, v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict, strict=False)

model.eval()
model.to(device)
if compile:
    print("Compiling the model...")
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# load the tokenizer
vocab_source = checkpoint_dict.get("vocab_source", "llama2")
vocab_size = gptconf.vocab_size
if tokenizer:
    # a specific tokenizer is provided, use it
    tokenizer_model = tokenizer
else:
    # let's try to find the tokenizer model automatically. bit gross here...
    # vocab size 0 is the sentinel for the default Llama 2 tokenizer
    query_vocab_size = 0 if vocab_source == "llama2" else vocab_size
    tokenizer_model = get_tokenizer_model_path(vocab_size=query_vocab_size)
enc = Tokenizer(tokenizer_model=tokenizer_model)

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start_ids = enc.encode(start, bos=True, eos=False)
# [None, ...] adds a leading batch dimension of size 1
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(enc.decode(y[0].tolist()))
            print('---------------')
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
"""Saves the model as a TorchScript.
|
|
||||||
|
|
||||||
Usage examples:
|
|
||||||
./save_torchscript.py
|
|
||||||
./save_torchscript.py --dim=300
|
|
||||||
./save_torchscript.py --gzip_output=True --zero_params=True
|
|
||||||
|
|
||||||
The resulting file can be loaded in C++ code and then used for training or
|
|
||||||
inference with:
|
|
||||||
#include <torch/script.h>
|
|
||||||
torch::jit::Module module = torch::jit::load("model.pt")
|
|
||||||
|
|
||||||
Note that the serialized model includes the initial parameters and with the default
|
|
||||||
ModelArgs the file is 59M and gzips down to 55M. If you want to serialize/distribute
|
|
||||||
the model parameters separately you can zero out the parameters before saving it and
|
|
||||||
it will gzip down to 780K.
|
|
||||||
"""
|
|
||||||
import gzip
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
from inspect import signature
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
from model import ModelArgs, Transformer
|
|
||||||
|
|
||||||
# Model args config
# main() looks each ModelArgs constructor parameter up by name in this module's
# globals, so these names must stay in sync with ModelArgs.
dim = 288
n_layers = 6
n_heads = 6
n_kv_heads = n_heads
multiple_of = 32
max_seq_len = 256
dropout = 0.0
vocab_size = 32000
norm_eps = 1e-5
# Save config
model_path = "model.pt"
zero_params = False
gzip_output = False
# Allow config overrides; the file handle is closed as soon as the source is read
with open("configurator.py") as _configurator:
    exec(_configurator.read())
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Build a Transformer from the module-level config and save it as TorchScript.

    Every ModelArgs constructor parameter is looked up by name in this module's
    globals. When zero_params is set the parameters are zeroed before saving;
    when gzip_output is set the saved file is replaced by a gzipped copy.
    """
    model_args = {name: globals()[name] for name in signature(ModelArgs).parameters}
    model = Transformer(ModelArgs(**model_args))

    # Zeroing the parameters is useful together with gzip_output: the
    # serialized file then compresses far better.
    if zero_params:
        for param in model.parameters():
            param.detach().zero_()

    torch.jit.save(torch.jit.script(model), model_path)

    if not gzip_output:
        return

    # compress into <model_path>.gz and drop the uncompressed original
    with open(model_path, "rb") as raw, gzip.open(f"{model_path}.gz", "wb") as packed:
        shutil.copyfileobj(raw, packed)
    os.unlink(model_path)


if __name__ == "__main__":
    main()
|
|
||||||
-274
@@ -1,274 +0,0 @@
|
|||||||
"""
|
|
||||||
Download, preprocess and serve the TinyStories dataset as a DataLoader.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import glob
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
from typing import List
|
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import requests
|
|
||||||
import torch
|
|
||||||
import torch.distributed as dist
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from tokenizer import Tokenizer
|
|
||||||
|
|
||||||
DATA_CACHE_DIR = "data"
|
|
||||||
|
|
||||||
def download_file(url: str, fname: str, chunk_size=1024):
    """Helper function to download a file from a given url.

    Args:
        url: address to fetch.
        fname: local path the response body is written to.
        chunk_size: number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: if the server answers with an error status. The
            check happens before fname is opened, so an error page is never
            saved under the target name (callers skip the download when the
            file already exists).
    """
    # using the response as a context manager releases the connection on exit
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        # content-length may be absent; the progress bar then gets a total of 0
        total = int(resp.headers.get("content-length", 0))
        with open(fname, "wb") as file, tqdm(
            desc=fname,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=chunk_size):
                size = file.write(data)
                bar.update(size)
|
|
||||||
|
|
||||||
|
|
||||||
def download():
    """Downloads the TinyStories dataset to DATA_CACHE_DIR"""
    os.makedirs(DATA_CACHE_DIR, exist_ok=True)

    # step 1: fetch the archive, unless a file with that name is already there
    archive_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz"
    archive_path = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data.tar.gz")
    if os.path.exists(archive_path):
        print(f"{archive_path} already exists, skipping download...")
    else:
        print(f"Downloading {archive_url} to {archive_path}...")
        download_file(archive_url, archive_path)

    # step 2: unpack the archive into its json shards, unless the target
    # directory already exists
    extract_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    if os.path.exists(extract_dir):
        print(f"{extract_dir} already exists, skipping unpacking...")
    else:
        os.makedirs(extract_dir, exist_ok=True)
        print(f"Unpacking {archive_path}...")
        os.system(f"tar -xzf {archive_path} -C {extract_dir}")

    # step 3: load the first shard and show one story as a sanity check
    shards = sorted(glob.glob(os.path.join(extract_dir, "*.json")))
    with open(shards[0], "r") as f:
        stories = json.load(f)
    print("Download done.")
    print(f"Number of shards: {len(shards)}")
    print(f"Example story:\n{stories[0]}")
|
|
||||||
|
|
||||||
def train_vocab(vocab_size):
    """
    Train a custom sentencepiece tokenizer on the TinyStories dataset.

    The tokenizer files are written with the prefix DATA_CACHE_DIR/tok{N},
    where N is the vocab size. The function is interactive: it asks for
    confirmation before running train_vocab.sh and before deleting the
    temporary text file.
    """
    assert vocab_size > 0, "Vocab size must be positive"

    # sentencepiece output prefix
    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
    # only a handful of shards are used for vocab training, for efficiency
    num_shards = 10

    # 1) dump the stories of the first shards, one per line, into tiny.txt
    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
    shard_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_paths = sorted(glob.glob(os.path.join(shard_dir, "*.json")))

    print(f"Writing temporary file {tiny_file} with {num_shards} shards...")
    with open(tiny_file, "w") as out:
        for shard_path in tqdm(shard_paths[:num_shards]):
            with open(shard_path, "r") as f:
                stories = json.load(f)
            for story in stories:
                out.write(story["story"].strip() + "\n")
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

    # 2) show the training command and run it only after confirmation
    print("Will now train the vocab with:")
    cmd = f"bash train_vocab.sh {tiny_file} {prefix} {vocab_size}"
    print(cmd)
    print("OK? [y/N] ")
    answer = input()
    if answer.lower() != "y":
        print("Exiting...")
        return
    os.system(cmd)

    # 3) optionally remove the temporary text file
    answer = input(f"Delete the temporary file {tiny_file}? [y/N] ")
    if answer.lower() == "y":
        os.remove(tiny_file)
        print(f"Deleted {tiny_file}")

    print(f"Trained tokenizer is in {prefix}.model")
    print("Done.")
|
|
||||||
|
|
||||||
|
|
||||||
def process_shard(args, vocab_size):
    """Tokenize one json shard and write the token ids as a uint16 .bin file.

    Args:
        args: (shard_id, shard) pair; shard_id positions the progress bar and
            shard is the path of the json file.
        vocab_size: 0 selects the default Llama 2 tokenizer and writes the .bin
            next to the .json; any other value selects the tok{N} tokenizer and
            writes into the DATA_CACHE_DIR/tok{N} directory.
    """
    shard_id, shard = args
    enc = Tokenizer(get_tokenizer_model_path(vocab_size))
    with open(shard, "r") as f:
        stories = json.load(f)

    token_stream = []
    for story in tqdm(stories, position=shard_id):
        # strip surrounding whitespace, then encode with a leading BOS token
        token_stream.extend(enc.encode(story["story"].strip(), bos=True, eos=False))
    # convert to uint16 nparray
    token_stream = np.array(token_stream, dtype=np.uint16)

    # work out where the tokenized shard goes
    if vocab_size == 0:
        out_path = shard.replace(".json", ".bin")
    else:
        out_path = os.path.join(
            DATA_CACHE_DIR,
            f"tok{vocab_size}",
            os.path.basename(shard).replace(".json", ".bin"),
        )

    with open(out_path, "wb") as f:
        f.write(token_stream.tobytes())

    # average sequence length: total tokens over the number of BOS (=1) tokens
    avg_seq_len = token_stream.size / ((token_stream == 1).sum())
    print(f"Saved {out_path}, average seqlen: {avg_seq_len:.2f}")
|
|
||||||
|
|
||||||
|
|
||||||
def pretokenize(vocab_size):
    """Tokenize every TinyStories json shard in a pool of worker processes.

    Args:
        vocab_size: forwarded to process_shard; values above 0 also cause the
            DATA_CACHE_DIR/tok{N} output directory to be created here.

    Raises:
        Whatever exception a worker raised while processing its shard.
    """
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if vocab_size > 0:
        # .bin files will be saved into tok{N} directory, create it once here
        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
        os.makedirs(bin_dir, exist_ok=True)

    # process all the shards in a process pool
    fun = partial(process_shard, vocab_size=vocab_size)
    with ProcessPoolExecutor() as executor:
        # executor.map only re-raises a worker's exception when its result is
        # retrieved, so drain the iterator; otherwise failed shards would go
        # unnoticed and "Done." would still be printed
        list(executor.map(fun, enumerate(shard_filenames)))
    print("Done.")
|
|
||||||
|
|
||||||
|
|
||||||
class PretokDataset(torch.utils.data.IterableDataset):
    """Loads pretokenized examples from disk and yields them as PyTorch tensors."""

    def __init__(self, split, max_seq_len, vocab_size, vocab_source):
        """Store the dataset settings; no files are touched until iteration.

        Args:
            split: "train" uses every shard but the first; any other value
                uses only the first shard.
            max_seq_len: length of each yielded x and y sequence.
            vocab_size: selects the tok{N} directory when vocab_source is "custom".
            vocab_source: "llama2" or "custom".
        """
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.vocab_source = vocab_source

    def __iter__(self):
        """Yield (x, y) int64 tensor pairs forever, y being x shifted by one token.

        Raises:
            ValueError: if vocab_source is neither "llama2" nor "custom".
            AssertionError: if no shard is available for the split, or a shard
                is too small to provide a single full sequence.
        """
        # get worker info within a DataLoader
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        # get DDP rank info
        rank = dist.get_rank() if dist.is_initialized() else 0
        # combine the worker_id and worker_rank to create a unique seed for rng
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
        if self.vocab_source == "llama2":
            # the .bin files are right along the .json files
            bin_dir = os.path.join(DATA_CACHE_DIR, "TinyStories_all_data")
        elif self.vocab_source == "custom":
            # the .bin files are in tok{N} directory
            bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}")
        else:
            # fail with a clear message instead of an unbound-variable error below
            raise ValueError(f"Unknown vocab_source {self.vocab_source}")
        shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
        # train/test split. let's use only shard 0 for test split, rest train
        shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
        # with no shards the endless loop below would spin without ever yielding
        assert len(shard_filenames) > 0, f"No bin files found in {bin_dir} for split {self.split}"
        while True:
            rng.shuffle(shard_filenames)
            for shard in shard_filenames:
                # open the dataset for reading but keep it on disk with memmap
                m = np.memmap(shard, dtype=np.uint16, mode="r")
                num_batches = len(m) // self.max_seq_len
                num_batches -= 1  # drop the last partial batch
                assert num_batches > 0, "this shard is way too small? investigate."
                ixs = list(range(num_batches))
                rng.shuffle(ixs)
                for ix in ixs:
                    start = ix * self.max_seq_len
                    # one extra token so that x and y can be offset by one
                    end = start + self.max_seq_len + 1
                    # calling .astype will copy the data into a new numpy array, now in RAM
                    chunk = torch.from_numpy((m[start:end]).astype(np.int64))
                    x = chunk[:-1]
                    y = chunk[1:]
                    yield x, y
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# public interface functions
|
|
||||||
|
|
||||||
def get_tokenizer_model_path(vocab_size):
    """
    Return the path of the sentencepiece tokenizer model for a vocab size.

    A vocab_size of 0 stands for the default Llama 2 tokenizer; None is
    returned in that case. Otherwise the path is DATA_CACHE_DIR/tok{N}.model.
    """
    return None if vocab_size == 0 else os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
|
|
||||||
|
|
||||||
class Task:
    """Namespace for the batch iterator over the pretokenized dataset."""

    @staticmethod
    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
        """Yield (x, y) batches that have been moved to `device`.

        Args:
            batch_size: number of sequences per batch.
            device: target passed to Tensor.to for both tensors.
            num_workers: forwarded to the DataLoader.
            **dataset_kwargs: forwarded unchanged to PretokDataset.
        """
        loader = torch.utils.data.DataLoader(
            PretokDataset(**dataset_kwargs),
            batch_size=batch_size,
            pin_memory=True,
            num_workers=num_workers,
        )
        for inputs, targets in loader:
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            yield inputs, targets
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# CLI for constructing the dataset
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    """
    These stages are designed to be run in order.

    To tokenize data with the Llama 2 tokenizer:
    python tinystories.py download
    python tinystories.py pretokenize

    To tokenize data with a custom tokenizer we train ourselves with sentencepiece, e.g.:
    python tinystories.py download
    python tinystories.py train_vocab --vocab_size=2048
    python tinystories.py pretokenize --vocab_size=2048
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("stage", type=str, choices=["download", "pretokenize", "train_vocab"])
    parser.add_argument("--vocab_size", type=int, default=0, help="pretokenization vocab size. 0 = use Llama 2 tokenizer.")
    args = parser.parse_args()

    # map each stage name to the call that runs it
    stage_handlers = {
        "download": download,
        "train_vocab": lambda: train_vocab(vocab_size=args.vocab_size),
        "pretokenize": lambda: pretokenize(vocab_size=args.vocab_size),
    }
    handler = stage_handlers.get(args.stage)
    if handler is None:
        raise ValueError(f"Unknown stage {args.stage}")
    handler()
|
|
||||||
@@ -1,78 +0,0 @@
|
|||||||
# Taken from llama code and lightly modified
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
|
|
||||||
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import argparse
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sentencepiece import SentencePieceProcessor
|
|
||||||
|
|
||||||
TOKENIZER_MODEL = "tokenizer.model" # the llama sentencepiece tokenizer model
|
|
||||||
|
|
||||||
class Tokenizer:
    """Wrapper around a sentencepiece model with BOS/EOS handling and a binary export."""

    def __init__(self, tokenizer_model=None):
        """Load the sentencepiece model.

        Args:
            tokenizer_model: path to a sentencepiece model file; when falsy
                the module-level TOKENIZER_MODEL path is used.
        """
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Encode a string into token ids, optionally adding BOS and/or EOS ids."""
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode a list of token ids back into a string."""
        return self.sp_model.decode(t)

    def export(self):
        """Write the vocabulary to a .bin file next to the model file.

        Layout: one unsigned int holding the maximum token length in bytes,
        then for every token its score (float), its byte length (unsigned
        int) and its utf-8 bytes.
        """
        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):

            # decode the token and light postprocessing
            t = self.sp_model.id_to_piece(i)
            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            t = t.replace('▁', ' ') # sentencepiece uses this character as whitespace
            b = t.encode('utf-8') # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length
        max_token_length = max(len(t) for t in tokens)

        # write to a binary file
        # the tokenizer.bin file is the same as .model file, but .bin
        # Only the final extension is swapped: a plain str.replace would also
        # rewrite '.model' inside directory names and, for a path without a
        # '.model' suffix, would return the model path itself, so the export
        # would overwrite the model file.
        tokenizer_bin = os.path.splitext(self.model_path)[0] + '.bin'
        with open(tokenizer_bin, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            # token_bytes rather than `bytes`, which would shadow the builtin
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # CLI entry point: load the tokenizer (optionally a custom model) and
    # export its vocabulary to the binary format
    cli = argparse.ArgumentParser()
    cli.add_argument("-t", "--tokenizer-model", type=str, help="optional path to custom tokenizer ")
    cli_args = cli.parse_args()

    Tokenizer(cli_args.tokenizer_model).export()
|
|
||||||
@@ -1,342 +0,0 @@
|
|||||||
"""
|
|
||||||
This training script can be run both on a single gpu in debug mode,
|
|
||||||
and also in a larger training run with distributed data parallel (ddp).
|
|
||||||
|
|
||||||
To run on a single GPU small debug run, example:
|
|
||||||
$ python train.py --compile=False --eval_iters=10 --batch_size=8
|
|
||||||
|
|
||||||
To run with DDP on 4 gpus on 1 node, example:
|
|
||||||
$ torchrun --standalone --nproc_per_node=4 train.py
|
|
||||||
|
|
||||||
To run with DDP on 4 gpus across 2 nodes, example:
|
|
||||||
- Run on the first (master) node with example IP 123.456.123.456:
|
|
||||||
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
|
|
||||||
- Run on the worker node:
|
|
||||||
$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
|
|
||||||
(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import math
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
from contextlib import nullcontext
|
|
||||||
from datetime import datetime
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from model import Transformer, ModelArgs
|
|
||||||
from torch.distributed import destroy_process_group, init_process_group
|
|
||||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
|
||||||
|
|
||||||
from tinystories import Task
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# I/O
|
|
||||||
out_dir = "out"
|
|
||||||
eval_interval = 2000
|
|
||||||
log_interval = 1
|
|
||||||
eval_iters = 100
|
|
||||||
eval_only = False # if True, script exits right after the first eval
|
|
||||||
always_save_checkpoint = False # if True, always save a checkpoint after each eval
|
|
||||||
init_from = "scratch" # 'scratch' or 'resume'
|
|
||||||
# wandb logging
|
|
||||||
wandb_log = False # disabled by default
|
|
||||||
wandb_project = "llamac"
|
|
||||||
wandb_run_name = "run" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
|
|
||||||
# data
|
|
||||||
batch_size = 128 # if gradient_accumulation_steps > 1, this is the micro-batch size
|
|
||||||
max_seq_len = 256
|
|
||||||
vocab_source = "llama2" # llama2|custom; use Llama 2 vocab from Meta, or custom trained
|
|
||||||
vocab_size = 32000 # the Llama 2 tokenizer has 32K tokens
|
|
||||||
# model
|
|
||||||
dim = 288
|
|
||||||
n_layers = 6
|
|
||||||
n_heads = 6
|
|
||||||
n_kv_heads = 6
|
|
||||||
multiple_of = 32
|
|
||||||
dropout = 0.0
|
|
||||||
# adamw optimizer
|
|
||||||
gradient_accumulation_steps = 4 # used to simulate larger batch sizes
|
|
||||||
learning_rate = 5e-4 # max learning rate
|
|
||||||
max_iters = 100000 # total number of training iterations
|
|
||||||
weight_decay = 1e-1
|
|
||||||
beta1 = 0.9
|
|
||||||
beta2 = 0.95
|
|
||||||
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
|
|
||||||
# learning rate decay settings
|
|
||||||
decay_lr = True # whether to decay the learning rate
|
|
||||||
warmup_iters = 1000 # how many steps to warm up for
|
|
||||||
# system
|
|
||||||
device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
|
|
||||||
dtype = "bfloat16" # float32|bfloat16|float16
|
|
||||||
compile = True # use PyTorch 2.0 to compile the model to be faster
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
config_keys = [
|
|
||||||
k
|
|
||||||
for k, v in globals().items()
|
|
||||||
if not k.startswith("_") and isinstance(v, (int, float, bool, str))
|
|
||||||
]
|
|
||||||
exec(open("configurator.py").read()) # overrides from command line or config file
|
|
||||||
config = {k: globals()[k] for k in config_keys} # will be useful for logging
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
# fixing some hyperparams to sensible defaults
|
|
||||||
lr_decay_iters = max_iters # should be ~= max_iters per Chinchilla
|
|
||||||
min_lr = 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
|
|
||||||
|
|
||||||
# validating checks
|
|
||||||
assert vocab_source in ["llama2", "custom"]
|
|
||||||
assert vocab_source == "custom" or vocab_size == 32000, "The vocab from Meta has 32K tokens"
|
|
||||||
|
|
||||||
# various inits, derived attributes, I/O setup
|
|
||||||
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
|
|
||||||
if ddp:
|
|
||||||
init_process_group(backend="nccl")
|
|
||||||
ddp_rank = int(os.environ["RANK"])
|
|
||||||
ddp_local_rank = int(os.environ["LOCAL_RANK"])
|
|
||||||
ddp_world_size = int(os.environ["WORLD_SIZE"])
|
|
||||||
device = f"cuda:{ddp_local_rank}"
|
|
||||||
torch.cuda.set_device(device)
|
|
||||||
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
|
|
||||||
seed_offset = ddp_rank # each process gets a different seed
|
|
||||||
# world_size number of processes will be training simultaneously, so we can scale
|
|
||||||
# down the desired gradient accumulation iterations per process proportionally
|
|
||||||
assert gradient_accumulation_steps % ddp_world_size == 0
|
|
||||||
gradient_accumulation_steps //= ddp_world_size
|
|
||||||
else:
|
|
||||||
# if not ddp, we are running on a single gpu, and one process
|
|
||||||
master_process = True
|
|
||||||
seed_offset = 0
|
|
||||||
ddp_world_size = 1
|
|
||||||
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len
|
|
||||||
if master_process:
|
|
||||||
print(f"tokens per iteration will be: {tokens_per_iter:,}")
|
|
||||||
print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len")
|
|
||||||
|
|
||||||
if master_process:
|
|
||||||
os.makedirs(out_dir, exist_ok=True)
|
|
||||||
torch.manual_seed(1337 + seed_offset)
|
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
|
|
||||||
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
|
|
||||||
device_type = "cuda" if "cuda" in device else "cpu" # for later use in torch.autocast
|
|
||||||
# note: float16 data type will automatically use a GradScaler
|
|
||||||
ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]
|
|
||||||
ctx = (
|
|
||||||
nullcontext()
|
|
||||||
if device_type == "cpu"
|
|
||||||
else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
|
|
||||||
)
|
|
||||||
|
|
||||||
# task-specific setup
|
|
||||||
iter_batches = partial(
|
|
||||||
Task.iter_batches,
|
|
||||||
batch_size=batch_size,
|
|
||||||
max_seq_len=max_seq_len,
|
|
||||||
vocab_size=vocab_size,
|
|
||||||
vocab_source=vocab_source,
|
|
||||||
device=device,
|
|
||||||
num_workers=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
|
|
||||||
iter_num = 0
|
|
||||||
best_val_loss = 1e9
|
|
||||||
|
|
||||||
# model init
|
|
||||||
model_args = dict(
|
|
||||||
dim=dim,
|
|
||||||
n_layers=n_layers,
|
|
||||||
n_heads=n_heads,
|
|
||||||
n_kv_heads=n_kv_heads,
|
|
||||||
vocab_size=vocab_size,
|
|
||||||
multiple_of=multiple_of,
|
|
||||||
max_seq_len=max_seq_len,
|
|
||||||
dropout=dropout,
|
|
||||||
) # start with model_args from command line
|
|
||||||
if init_from == "scratch":
|
|
||||||
# init a new model from scratch
|
|
||||||
print("Initializing a new model from scratch")
|
|
||||||
gptconf = ModelArgs(**model_args)
|
|
||||||
model = Transformer(gptconf)
|
|
||||||
elif init_from == "resume":
|
|
||||||
print(f"Resuming training from {out_dir}")
|
|
||||||
# resume training from a checkpoint.
|
|
||||||
ckpt_path = os.path.join(out_dir, "ckpt.pt")
|
|
||||||
checkpoint = torch.load(ckpt_path, map_location=device)
|
|
||||||
checkpoint_model_args = checkpoint["model_args"]
|
|
||||||
# force these config attributes to be equal otherwise we can't even resume training
|
|
||||||
# the rest of the attributes (e.g. dropout) can stay as desired from command line
|
|
||||||
for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
|
|
||||||
model_args[k] = checkpoint_model_args[k]
|
|
||||||
# create the model
|
|
||||||
gptconf = ModelArgs(**model_args)
|
|
||||||
model = Transformer(gptconf)
|
|
||||||
state_dict = checkpoint["model"]
|
|
||||||
# fix the keys of the state dictionary :(
|
|
||||||
# honestly no idea how checkpoints sometimes get this prefix, have to debug more
|
|
||||||
unwanted_prefix = "_orig_mod."
|
|
||||||
for k, v in list(state_dict.items()):
|
|
||||||
if k.startswith(unwanted_prefix):
|
|
||||||
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
|
|
||||||
model.load_state_dict(state_dict)
|
|
||||||
iter_num = checkpoint["iter_num"]
|
|
||||||
best_val_loss = checkpoint["best_val_loss"]
|
|
||||||
model.to(device)
|
|
||||||
|
|
||||||
# initialize a GradScaler. If enabled=False scaler is a no-op
|
|
||||||
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == "float16"))
|
|
||||||
|
|
||||||
# optimizer
|
|
||||||
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
|
|
||||||
if init_from == "resume" and "optimizer" in checkpoint:
|
|
||||||
optimizer.load_state_dict(checkpoint["optimizer"])
|
|
||||||
checkpoint = None # free up memory
|
|
||||||
|
|
||||||
# compile the model
|
|
||||||
if compile:
|
|
||||||
print("compiling the model... (takes a ~minute)")
|
|
||||||
unoptimized_model = model
|
|
||||||
model = torch.compile(model) # requires PyTorch 2.0
|
|
||||||
|
|
||||||
# wrap model into DDP container
|
|
||||||
if ddp:
|
|
||||||
# Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
|
|
||||||
# construction time since NCCL does not support `ComplexFloat`
|
|
||||||
prefix = "_orig_mod." if compile else ""
|
|
||||||
model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
|
|
||||||
model = DDP(model, device_ids=[ddp_local_rank])
|
|
||||||
|
|
||||||
# helps estimate an arbitrarily accurate loss over either split using many batches
|
|
||||||
@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` batches of the train and val splits.

    The model is switched to eval mode while the batches are scored and put
    back into train mode before returning.

    Returns:
        dict mapping "train" and "val" to the mean loss over the sampled
        batches of that split.
    """
    model.eval()
    mean_losses = {}
    for split in ("train", "val"):
        batches = iter_batches(split=split)
        per_batch = torch.zeros(eval_iters)  # keep on CPU
        for i in range(eval_iters):
            X, Y = next(batches)
            with ctx:
                # the return value is not needed; the loss is read back from
                # raw_model.last_loss right after the forward call
                model(X, Y)
            per_batch[i] = raw_model.last_loss.item()
        mean_losses[split] = per_batch.mean()
    model.train()
    return mean_losses
|
|
||||||
|
|
||||||
# learning rate decay scheduler (cosine with warmup)
|
|
||||||
def get_lr(it):
    """Return the learning rate for iteration `it` (cosine decay with warmup).

    Three phases, checked in this order:
      * it < warmup_iters    -> linear ramp: learning_rate * it / warmup_iters
      * it > lr_decay_iters  -> constant min_lr
      * otherwise            -> cosine interpolation from learning_rate down
                                to min_lr
    """
    if it < warmup_iters:
        # phase 1: linear warmup
        return learning_rate * it / warmup_iters

    if it > lr_decay_iters:
        # phase 2: past the decay horizon, hold at the minimum rate
        return min_lr

    # phase 3: fraction of the decay window that has elapsed
    elapsed = it - warmup_iters
    window = lr_decay_iters - warmup_iters
    progress = elapsed / window
    assert 0 <= progress <= 1

    # cosine weight goes from 1 (start of decay) to 0 (end of decay)
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    lr_span = learning_rate - min_lr
    return min_lr + cosine_weight * lr_span
|
|
||||||
|
|
||||||
# logging
|
|
||||||
if wandb_log and master_process:
|
|
||||||
import wandb
|
|
||||||
wandb.init(project=wandb_project, name=wandb_run_name, config=config)
|
|
||||||
|
|
||||||
# training loop
|
|
||||||
train_batch_iter = iter_batches(split="train")
|
|
||||||
X, Y = next(train_batch_iter) # fetch the very first batch
|
|
||||||
t0 = time.time()
|
|
||||||
local_iter_num = 0 # number of iterations in the lifetime of this process
|
|
||||||
raw_model = model.module if ddp else model # unwrap DDP container if needed
|
|
||||||
running_mfu = -1.0
|
|
||||||
while True:
|
|
||||||
# determine and set the learning rate for this iteration
|
|
||||||
lr = get_lr(iter_num) if decay_lr else learning_rate
|
|
||||||
for param_group in optimizer.param_groups:
|
|
||||||
param_group["lr"] = lr
|
|
||||||
|
|
||||||
# evaluate the loss on train/val sets and write checkpoints
|
|
||||||
if iter_num % eval_interval == 0 and master_process:
|
|
||||||
losses = estimate_loss()
|
|
||||||
print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
|
|
||||||
if wandb_log:
|
|
||||||
try:
|
|
||||||
wandb.log(
|
|
||||||
{
|
|
||||||
"iter": iter_num,
|
|
||||||
"tokens": iter_num * tokens_per_iter,
|
|
||||||
"loss/train": losses["train"],
|
|
||||||
"loss/val": losses["val"],
|
|
||||||
"lr": lr,
|
|
||||||
"mfu": running_mfu * 100, # convert to percentage
|
|
||||||
}
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"logging to wandb failed: {e}")
|
|
||||||
if losses["val"] < best_val_loss or always_save_checkpoint:
|
|
||||||
best_val_loss = losses["val"]
|
|
||||||
if iter_num > 0:
|
|
||||||
checkpoint = {
|
|
||||||
"model": raw_model.state_dict(),
|
|
||||||
"optimizer": optimizer.state_dict(),
|
|
||||||
"model_args": model_args,
|
|
||||||
"iter_num": iter_num,
|
|
||||||
"best_val_loss": best_val_loss,
|
|
||||||
"config": config,
|
|
||||||
}
|
|
||||||
print(f"saving checkpoint to {out_dir}")
|
|
||||||
torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt"))
|
|
||||||
raw_model.export(os.path.join(out_dir, "model.bin"))
|
|
||||||
if iter_num == 0 and eval_only:
|
|
||||||
break
|
|
||||||
|
|
||||||
# forward backward update, with optional gradient accumulation to simulate larger batch size
|
|
||||||
# and using the GradScaler if data type is float16
|
|
||||||
for micro_step in range(gradient_accumulation_steps):
|
|
||||||
if ddp:
|
|
||||||
# in DDP training we only need to sync gradients at the last micro step.
|
|
||||||
# the official way to do this is with model.no_sync() context manager, but
|
|
||||||
# I really dislike that this bloats the code and forces us to repeat code
|
|
||||||
# looking at the source of that context manager, it just toggles this variable
|
|
||||||
model.require_backward_grad_sync = micro_step == gradient_accumulation_steps - 1
|
|
||||||
with ctx:
|
|
||||||
logits = model(X, Y)
|
|
||||||
loss = raw_model.last_loss
|
|
||||||
loss = loss / gradient_accumulation_steps
|
|
||||||
# immediately async prefetch next batch while model is doing the forward pass on the GPU
|
|
||||||
X, Y = next(train_batch_iter)
|
|
||||||
# backward pass, with gradient scaling if training in fp16
|
|
||||||
scaler.scale(loss).backward()
|
|
||||||
# clip the gradient
|
|
||||||
if grad_clip != 0.0:
|
|
||||||
scaler.unscale_(optimizer)
|
|
||||||
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
|
|
||||||
# step the optimizer and scaler if training in fp16
|
|
||||||
scaler.step(optimizer)
|
|
||||||
scaler.update()
|
|
||||||
# flush the gradients as soon as we can, no need for this memory anymore
|
|
||||||
optimizer.zero_grad(set_to_none=True)
|
|
||||||
|
|
||||||
# timing and logging
|
|
||||||
t1 = time.time()
|
|
||||||
dt = t1 - t0
|
|
||||||
t0 = t1
|
|
||||||
if iter_num % log_interval == 0 and master_process:
|
|
||||||
# get loss as float, scale up due to the divide above. note: this is a CPU-GPU sync point
|
|
||||||
lossf = loss.item() * gradient_accumulation_steps
|
|
||||||
if local_iter_num >= 5: # let the training loop settle a bit
|
|
||||||
mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
|
|
||||||
running_mfu = mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
|
|
||||||
print(
|
|
||||||
f"{iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
|
|
||||||
)
|
|
||||||
iter_num += 1
|
|
||||||
local_iter_num += 1
|
|
||||||
|
|
||||||
# termination conditions
|
|
||||||
if iter_num > max_iters:
|
|
||||||
break
|
|
||||||
|
|
||||||
if ddp:
|
|
||||||
destroy_process_group()
|
|
||||||
-126
@@ -1,126 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Trains a sentencepiece tokenizer model on a bunch of given data, my best
|
|
||||||
# effort attempt to replicate how Meta trained their Llama 2 tokenizer.
|
|
||||||
|
|
||||||
# usage: $ train_vocab.sh <input> <model_prefix> <vocab_size>
|
|
||||||
# example:
|
|
||||||
# ./train_vocab.sh tiny.txt tokenizer_tiny 1024
|
|
||||||
# requirements:
|
|
||||||
# install https://github.com/google/sentencepiece
|
|
||||||
|
|
||||||
# check if the correct number of arguments are provided
|
|
||||||
if [ $# -ne 3 ]; then
|
|
||||||
echo "Usage: $0 <input> <model_prefix> <vocab_size>"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# assign command-line arguments to variables
|
|
||||||
input=$1
|
|
||||||
model_prefix=$2
|
|
||||||
vocab_size=$3
|
|
||||||
|
|
||||||
# check if input file exists
|
|
||||||
if [ ! -f "$input" ]; then
|
|
||||||
echo "Usage: $0 <input> <model_prefix> <vocab_size>"
|
|
||||||
echo "input '$input' not found."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# check if vocab_size is a positive integer
|
|
||||||
if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || [ "$vocab_size" -lt 1 ]; then
|
|
||||||
echo "Usage: $0 <input> <model_prefix> <vocab_size>"
|
|
||||||
echo "vocab_size size must be a positive integer."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Print the processed inputs
|
|
||||||
echo "Input: $input"
|
|
||||||
echo "Model Prefix: $model_prefix"
|
|
||||||
echo "Vocabulary Size: $vocab_size"
|
|
||||||
|
|
||||||
# train a sentencepiece tokenizer model
|
|
||||||
# Llama 2 config can be printed as follows:
|
|
||||||
|
|
||||||
# import sentencepiece.sentencepiece_model_pb2
|
|
||||||
# mp = sentencepiece.sentencepiece_model_pb2.ModelProto()
|
|
||||||
# mp.ParseFromString(open("tokenizer.model", "rb").read())
|
|
||||||
# print(mp.trainer_spec)
|
|
||||||
# print(mp.normalizer_spec)
|
|
||||||
|
|
||||||
# this gives:
|
|
||||||
|
|
||||||
# trainer_spec {
|
|
||||||
# input: "/large_experiments/theorem/datasets/MERGED/all.test1.merged"
|
|
||||||
# model_prefix: "spm_model_32k_200M_charcov099995_allowWSO__v2"
|
|
||||||
# model_type: BPE
|
|
||||||
# vocab_size: 32000
|
|
||||||
# self_test_sample_size: 0
|
|
||||||
# input_format: "text"
|
|
||||||
# character_coverage: 0.9999499917030334
|
|
||||||
# input_sentence_size: 200000000
|
|
||||||
# seed_sentencepiece_size: 1000000
|
|
||||||
# shrinking_factor: 0.75
|
|
||||||
# num_threads: 80
|
|
||||||
# num_sub_iterations: 2
|
|
||||||
# max_sentence_length: 4192
|
|
||||||
# shuffle_input_sentence: true
|
|
||||||
# max_sentencepiece_length: 16
|
|
||||||
# split_by_unicode_script: true
|
|
||||||
# split_by_whitespace: true
|
|
||||||
# split_by_number: true
|
|
||||||
# treat_whitespace_as_suffix: false
|
|
||||||
# split_digits: true
|
|
||||||
# allow_whitespace_only_pieces: true
|
|
||||||
# vocabulary_output_piece_score: true
|
|
||||||
# hard_vocab_limit: true
|
|
||||||
# use_all_vocab: false
|
|
||||||
# byte_fallback: true
|
|
||||||
# required_chars: ""
|
|
||||||
# unk_id: 0
|
|
||||||
# bos_id: 1
|
|
||||||
# eos_id: 2
|
|
||||||
# pad_id: -1
|
|
||||||
# unk_surface: " \342\201\207 "
|
|
||||||
# unk_piece: "<unk>"
|
|
||||||
# bos_piece: "<s>"
|
|
||||||
# eos_piece: "</s>"
|
|
||||||
# pad_piece: "<pad>"
|
|
||||||
# train_extremely_large_corpus: false
|
|
||||||
# enable_differential_privacy: false
|
|
||||||
# differential_privacy_noise_level: 0.0
|
|
||||||
# differential_privacy_clipping_threshold: 0
|
|
||||||
# }
|
|
||||||
# normalizer_spec {
|
|
||||||
# name: "identity"
|
|
||||||
# precompiled_charsmap: ""
|
|
||||||
# add_dummy_prefix: true
|
|
||||||
# remove_extra_whitespaces: false
|
|
||||||
# normalization_rule_tsv: ""
|
|
||||||
# }
|
|
||||||
|
|
||||||
# let's now use spm_train to train this exact model
|
|
||||||
# options docs: https://github.com/google/sentencepiece/blob/master/doc/options.md
|
|
||||||
|
|
||||||
# we'll depart on a few settings:
|
|
||||||
# character_coverage -> 1.0
|
|
||||||
|
|
||||||
# other important notes:
|
|
||||||
# --split_digits = true, per the paper
|
|
||||||
# --allow_whitespace_only_pieces is true, default in spm is false
|
|
||||||
# --byte_fallback is true, default in spm is false
|
|
||||||
# --normalization_rule_name is identity, default in spm is nmt_nfkc
|
|
||||||
|
|
||||||
spm_train --input="$input" \
|
|
||||||
--model_prefix="$model_prefix" \
|
|
||||||
--model_type=bpe \
|
|
||||||
--vocab_size="$vocab_size" \
|
|
||||||
--self_test_sample_size=0 \
|
|
||||||
--input_format="text" \
|
|
||||||
--character_coverage=1.0 \
|
|
||||||
--num_threads="$(nproc)" \
|
|
||||||
--split_digits=true \
|
|
||||||
--allow_whitespace_only_pieces=true \
|
|
||||||
--byte_fallback=true \
|
|
||||||
--unk_surface=" \342\201\207 " \
|
|
||||||
--normalization_rule_name=identity \
|
|
||||||
Reference in New Issue
Block a user