diff --git a/README.md b/README.md index 6f092ce..37b357d 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,23 @@ Then chat with it by specifying the chat mode using the `-m` flag, e.g.: ./run llama2_7b_chat.bin -m chat ``` +You can also try Meta's Code Llama models even if support for them is incomplete. +Make sure to build the tokenizer for the plain and instruct variants and pass it when doing inference. + +```bash +python export.py codellama2_7b.bin --meta-llama /path/to/CodeLlama-7b +python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b/tokenizer.model +./run codellama2_7b.bin -z /path/to/CodeLlama-7b/tokenizer.bin +``` + +Chat with Code Llama Instruct: + +```bash +python export.py codellama2_7b_instruct.bin --meta-llama /path/to/CodeLlama-7b-Instruct +python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b-Instruct/tokenizer.model +./run codellama2_7b_instruct.bin -m chat -z /path/to/CodeLlama-7b-Instruct/tokenizer.bin +``` + ## hugginface models We can load any huggingface models that use the Llama 2 architecture. See the script [export.py](export.py) and the `--hf` flag to export the model .bin file. diff --git a/export.py b/export.py index a60d7cf..4143f70 100644 --- a/export.py +++ b/export.py @@ -323,9 +323,10 @@ def load_meta_model(model_path): config.multiple_of = params["multiple_of"] config.norm_eps = params["norm_eps"] - config.vocab_size = 32000 + config.vocab_size = state_dict['tok_embeddings.weight'].shape[0] config.max_seq_len = 2048 + # create a new Transformer object and set weights model = Transformer(config)