From ea44f535682658f3c586719ef52fc985240461fe Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Mon, 21 Aug 2023 04:58:19 +0000
Subject: [PATCH] now that the export.py HF functionality is in master, we can
 delete this file, and update the readme

---
 README.md                   |   6 +-
 export_meta_llama_hf_bin.py | 113 ------------------------------------
 2 files changed, 5 insertions(+), 114 deletions(-)
 delete mode 100644 export_meta_llama_hf_bin.py

diff --git a/README.md b/README.md
index 7d3393b..ff15005 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ For this we need to install the python dependencies (`pip install -r requirement
 python export_meta_llama_bin.py path/to/llama/model/7B llama2_7b.bin
 ```
 
-The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that despite efforts, the 13B export currently doesn't work for unknown reasons (accepting PRs for fix). We can run the model as normal:
+The export will take ~10 minutes or so and generate a 26GB file (the weights of the 7B model in float32) called `llama2_7b.bin` in the current directory. It has been [reported](https://github.com/karpathy/llama2.c/pull/85) that, despite efforts, the 13B export currently doesn't work. I would not attempt to run anything above 7B right now for two reasons: first, 13B+ currently doesn't work because of integer overflow in pointer arithmetic, which is yet to be fixed, and second, even if it were fixed, this repo is doing float32 inference right now, so it would be too slow to be usable. Once the export is done, we can run it:
 
 ```bash
 ./run llama2_7b.bin
@@ -83,6 +83,10 @@ This ran at about 4 tokens/s compiled with [OpenMP](#OpenMP) on 96 threads on my
 
 base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should be possible to also inference the chat model quite easily, and have a conversation with it. And if we can find a way to run 7B more efficiently, we can start adding LoRA to our training script, and going wild with finetunes all within the repo!
 
+## huggingface models
+
+We can load any huggingface model that uses the Llama 2 architecture. See the script [export.py](export.py) and use its `--hf` flag to export the model to a .bin file.
+
 ## models
 
 For the sake of examples of smaller, from-scratch models, I trained a small model series on TinyStories. All of these trained in a few hours on my training setup (4X A100 40GB GPUs). The 110M took around 24 hours. I am hosting them on huggingface hub [tinyllamas](https://huggingface.co/karpathy/tinyllamas), both in the original PyTorch .pt, and also in the llama2.c format .bin:
diff --git a/export_meta_llama_hf_bin.py b/export_meta_llama_hf_bin.py
deleted file mode 100644
index e3a8c73..0000000
--- a/export_meta_llama_hf_bin.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""
-This script exports the Llama 2 weights in llama2c.bin format.
-"""
-import os
-import sys
-import struct
-from pathlib import Path
-import json
-
-import torch
-
-from model import precompute_freqs_cis
-
-
-def export(p, state_dict, filepath='model.bin'):
-    """export the model weights in fp32 into .bin file to be read from C"""
-    f = open(filepath, 'wb')
-
-    def serialize(key):
-        print(f"writing {key}...")
-        t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
-        f.write(memoryview(t))
-        del state_dict[key]
-
-    # first write out the header
-    hidden_dim = state_dict['model.layers.0.mlp.gate_proj.weight'].shape[0]
-    p['vocab_size'] = 32000
-    p['max_seq_len'] = 2048
-
-    n_kv_heads = p.get('n_kv_heads') or p['n_heads']
-    header = struct.pack(
-        'iiiiiii',
-        p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
-        n_kv_heads, -p['vocab_size'], p['max_seq_len']
-    )
-    # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
-    # in the checkpoint and should be loaded.
-    f.write(header)
-
-    # next write out the embedding weights
-    print("writing tok_embeddings...")
-    serialize('model.embed_tokens.weight')
-
-    # now all the layers
-    # attention weights
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.input_layernorm.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.q_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.k_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.v_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.self_attn.o_proj.weight')
-    # ffn weights
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.post_attention_layernorm.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.gate_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.down_proj.weight')
-    for i in range(p['n_layers']): serialize(f'model.layers.{i}.mlp.up_proj.weight')
-
-    # final rmsnorm
-    serialize('model.norm.weight')
-    # freqs_cos, freqs_sin
-    freqs_cos, freqs_sin = precompute_freqs_cis(p['dim'] // p['n_heads'], p['max_seq_len'] * 2)
-    state_dict['freqs_cos'] = freqs_cos[:p['max_seq_len']]
-    state_dict['freqs_sin'] = freqs_sin[:p['max_seq_len']]
-    # check if this requires addtional conversion
-    serialize('freqs_cos')
-    serialize('freqs_sin')
-
-    # finally write the output weights
-    serialize('lm_head.weight')
-
-    f.close()
-    print(f"wrote {filepath}")
-
-
-def concat_weights(models):
-    state_dict = {}
-    for name in list(models[0]):
-        tensors = [model[name] for model in models]
-        if len(tensors) == 1 or len(tensors[0].shape) == 1:
-            state_dict[name] = tensors[0]
-            continue
-        is_axis_1 = (
-            name.startswith('model.embed_tokens.weight')
-            or name.endswith('.self_attn.o_proj.weight')
-            or name.endswith('.mlp.down_proj.weight')
-        )
-        axis = 1 if is_axis_1 else 0
-        state_dict[name] = torch.cat(tensors, dim=axis)
-        for model in models:
-            del model[name]
-    return state_dict
-
-
-def load_and_export(model_path, output_path):
-    params_path = os.path.join(model_path, 'params.json')
-    with open(params_path) as f:
-        params = json.load(f)
-        print(params)
-
-    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
-    models = [torch.load(p, map_location='cpu') for p in model_paths]
-    state_dict = concat_weights(models)
-    del models
-    export(params, state_dict, output_path)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) == 1:
-        print('[Llama model folder path] [output path]')
-        exit()
-
-    model_path = sys.argv[1]
-    output_path = sys.argv[2]
-    load_and_export(model_path, output_path)