now that the export.py HF functionality is in master, we can delete this file, and update the readme

This commit is contained in:
Andrej Karpathy
2023-08-21 04:58:19 +00:00
parent 801c68f5a1
commit ea44f53568
2 changed files with 5 additions and 114 deletions
+5 -1
View File
@@ -71,7 +71,7 @@ For this we need to install the python dependencies (`pip install -r requirement
python export_meta_llama_bin.py path/to/llama/model/7B llama2_7b.bin
```
The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that despite efforts, the 13B export currently doesn't work for unknown reasons (accepting PRs for fix). We can run the model as normal:
The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. I would not attempt to run anything above 7B right now for two reasons: first, it has been [reported](https://github.com/karpathy/llama2.c/pull/85) that, despite efforts, 13B+ currently doesn't work because of integer overflow in pointer arithmetic, which is yet to be fixed, and second, even if it were fixed, this repo is doing float32 inference right now, so it would be fairly unusably slow. Once the export is done, we can run it:
```bash
./run llama2_7b.bin
@@ -83,6 +83,10 @@ This ran at about 4 tokens/s compiled with [OpenMP](#OpenMP) on 96 threads on my
base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should be possible to also inference the chat model quite easily, and have a conversation with it. And if we can find a way to run 7B more efficiently, we can start adding LoRA to our training script, and going wild with finetunes all within the repo!
## huggingface models
We can load any huggingface models that use the Llama 2 architecture. See the script [export.py](export.py) and the `--hf` flag to export the model .bin file.
## models
For the sake of examples of smaller, from-scratch models, I trained a small model series on TinyStories. All of these trained in a few hours on my training setup (4X A100 40GB GPUs). The 110M took around 24 hours. I am hosting them on huggingface hub [tinyllamas](https://huggingface.co/karpathy/tinyllamas), both in the original PyTorch .pt, and also in the llama2.c format .bin:
-113
View File
@@ -1,113 +0,0 @@
"""
This script exports the Llama 2 weights in llama2c.bin format.
"""
import os
import sys
import struct
from pathlib import Path
import json
import torch
from model import precompute_freqs_cis
def export(p, state_dict, filepath='model.bin'):
    """Export the model weights in fp32 into a .bin file to be read from C.

    The file layout is: a header of seven ints, the token embedding table,
    every per-layer weight grouped by kind (all layers of one kind before the
    next kind), the final norm weight, the freqs_cos / freqs_sin tables and
    finally the lm_head (classifier) weights.

    Args:
        p: dict of hyperparameters. 'dim', 'n_layers' and 'n_heads' are read,
            'n_kv_heads' is optional and falls back to 'n_heads'. NOTE: the
            'vocab_size' and 'max_seq_len' entries are overwritten in place
            with the hard-coded values 32000 and 2048.
        state_dict: dict mapping weight names to tensors. Each entry is
            deleted from the dict right after it has been written.
        filepath: path of the output file; it is opened in 'wb' mode.

    Raises:
        KeyError: if a required hyperparameter or weight name is missing
            (assuming `p` and `state_dict` are plain dicts).
    """
    # Per-layer weight kinds, in the exact order they are written to the file.
    per_layer_suffixes = (
        # attention weights
        'input_layernorm.weight',
        'self_attn.q_proj.weight',
        'self_attn.k_proj.weight',
        'self_attn.v_proj.weight',
        'self_attn.o_proj.weight',
        # ffn weights
        'post_attention_layernorm.weight',
        'mlp.gate_proj.weight',
        'mlp.down_proj.weight',
        'mlp.up_proj.weight',
    )

    # The context manager guarantees the file is closed even when a lookup or
    # a write below raises part-way through the export.
    with open(filepath, 'wb') as f:

        def serialize(key):
            """Write state_dict[key] as flat float32 data, then drop the entry."""
            print(f"writing {key}...")
            t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
            f.write(memoryview(t))
            # remove the entry once it is on disk
            del state_dict[key]

        # first write out the header
        hidden_dim = state_dict['model.layers.0.mlp.gate_proj.weight'].shape[0]
        p['vocab_size'] = 32000
        p['max_seq_len'] = 2048
        n_kv_heads = p.get('n_kv_heads') or p['n_heads']
        # NOTE: the negated vocab_size indicates that the classifier weights
        # are present in the checkpoint and should be loaded.
        header = struct.pack(
            'iiiiiii',
            p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
            n_kv_heads, -p['vocab_size'], p['max_seq_len'],
        )
        f.write(header)

        # next write out the embedding weights
        print("writing tok_embeddings...")
        serialize('model.embed_tokens.weight')

        # now all the layers: one full pass over the layers per weight kind
        for suffix in per_layer_suffixes:
            for i in range(p['n_layers']):
                serialize(f'model.layers.{i}.{suffix}')

        # final rmsnorm
        serialize('model.norm.weight')

        # freqs_cos, freqs_sin: computed for twice max_seq_len, then truncated
        freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
        state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
        state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
        # TODO: check if this requires additional conversion
        serialize('freqs_cos')
        serialize('freqs_sin')

        # finally write the output weights
        serialize('lm_head.weight')

    print(f"wrote {filepath}")
def concat_weights(models):
    """Merge a list of per-shard weight dicts into a single state dict.

    The names are taken from the first dict in `models`. A weight is passed
    through unchanged (taken from the first shard) when there is only one
    shard or when the tensor is 1-D. Every other weight is concatenated across
    shards: along dim 1 for the embedding table, `o_proj` and `down_proj`
    weights, along dim 0 otherwise. Concatenated entries are deleted from each
    input dict; passed-through entries are left in place.

    Args:
        models: non-empty list of dicts mapping weight names to tensors.

    Returns:
        A new dict mapping each weight name to its merged tensor.
    """
    merged = {}
    # Snapshot the names up front: entries are deleted from the shards below.
    for key in tuple(models[0]):
        shards = [shard[key] for shard in models]

        if len(shards) != 1 and len(shards[0].shape) != 1:
            joins_on_dim_1 = key.startswith('model.embed_tokens.weight') or key.endswith(
                ('.self_attn.o_proj.weight', '.mlp.down_proj.weight')
            )
            merged[key] = torch.cat(shards, dim=1 if joins_on_dim_1 else 0)
            for shard in models:
                del shard[key]
        else:
            # single shard or 1-D tensor: keep the first shard's copy as-is
            merged[key] = shards[0]

    return merged
def load_and_export(model_path, output_path):
    """Load the checkpoint shards found in a model folder and export one .bin file.

    Reads `params.json` from `model_path`, loads every file matching
    `consolidated.*.pth` in that folder onto the CPU, merges the shards with
    `concat_weights` and writes the result with `export`.

    Args:
        model_path: folder containing `params.json` and the checkpoint files.
        output_path: path of the .bin file to write.

    Raises:
        FileNotFoundError: if `params.json` is missing, or if no
            `consolidated.*.pth` file exists in `model_path`.
    """
    params_path = os.path.join(model_path, 'params.json')
    with open(params_path) as f:
        params = json.load(f)
    print(params)

    # Sorted so the shards are always concatenated in the same path order.
    model_paths = sorted(Path(model_path).glob('consolidated.*.pth'))
    if not model_paths:
        # Fail early with a clear message instead of an IndexError further down.
        raise FileNotFoundError(
            f"no 'consolidated.*.pth' checkpoint files found in {model_path}"
        )

    # NOTE: torch.load unpickles the files; only use it on trusted checkpoints.
    models = [torch.load(p, map_location='cpu') for p in model_paths]
    state_dict = concat_weights(models)
    # drop the reference to the per-shard dicts before exporting
    del models
    export(params, state_dict, output_path)
if __name__ == '__main__':
    # Command-line entry point: <script> [Llama model folder path] [output path]
    # Both positional arguments are required, so show the usage line whenever
    # either one is missing rather than failing later on sys.argv[2].
    if len(sys.argv) < 3:
        print('[Llama model folder path] [output path]')
        # sys.exit is always available, unlike the site-provided exit();
        # a non-zero status signals the usage error to the calling shell.
        sys.exit(1)

    model_path = sys.argv[1]
    output_path = sys.argv[2]
    load_and_export(model_path, output_path)