draft of int8 attempt number two

2023-08-26 22:28:08 +00:00
4 changed files with 1105 additions and 1 deletions
@@ -6,11 +6,13 @@ CC = gcc
 .PHONY: run
 # -O3 build of both executables: run (from run.c) and runq (from runq.c)
 run: run.c
 	$(CC) -O3 -o $@ $< -lm
 	$(CC) -O3 runq.c -lm -o runq
 # useful for a debug build, can then e.g. analyze with valgrind, example:
 # $ valgrind --leak-check=full ./run out/model.bin -n 3
 # declared phony: the recipe writes run and runq, never a file called
 # rundebug, so a stray file with that name must not mark the target up to date
 .PHONY: rundebug
 rundebug: run.c
 	$(CC) -g -o run run.c -lm
 	$(CC) -g -o runq runq.c -lm
 # https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
 # https://simonbyrne.github.io/notes/fastmath/
@@ -24,6 +26,7 @@ rundebug: run.c
 .PHONY: runfast
 # -Ofast build of both executables: run (from run.c) and runq (from runq.c)
 runfast: run.c
 	$(CC) -Ofast $< -lm -o run
 	$(CC) -Ofast runq.c -lm -o runq
 # additionally compiles with OpenMP, allowing multithreaded runs
 # make sure to also enable multiple threads when running, e.g.:
@@ -31,19 +34,23 @@ runfast: run.c
 .PHONY: runomp
 # -Ofast + OpenMP + -march=native build of run and runq
 runomp: run.c
 	$(CC) -Ofast -fopenmp -march=native -o run $< -lm
 	$(CC) -Ofast -fopenmp -march=native -o runq runq.c -lm
 .PHONY: win64
 # cross-compiles run.exe and runq.exe with the mingw-w64 gcc; win.c is
 # compiled into each executable alongside its main source file
 win64:
 	x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -I. run.c win.c -o run.exe
 	x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -I. runq.c win.c -o runq.exe
 # compiles with gnu11 standard flags for amazon linux, coreos, etc. compatibility
 .PHONY: rungnu
 # -Ofast build of run and runq with the C dialect pinned via -std=gnu11
 rungnu:
 	$(CC) -std=gnu11 -Ofast run.c -lm -o run
 	$(CC) -std=gnu11 -Ofast runq.c -lm -o runq
 .PHONY: runompgnu
 # -Ofast + OpenMP build of run and runq with the C dialect pinned via -std=gnu11
 runompgnu:
 	$(CC) -std=gnu11 -Ofast -fopenmp -o run run.c -lm
 	$(CC) -std=gnu11 -Ofast -fopenmp -o runq runq.c -lm
 # run all tests
 .PHONY: test
@@ -66,3 +73,4 @@ testcc:
 .PHONY: clean
 # removes the executables written by the build targets: run and runq, plus
 # run.exe and runq.exe from win64; rm -f stays silent for files that do not exist
 clean:
 	rm -f run runq run.exe runq.exe
@@ -109,7 +109,6 @@ Chat with Code Llama Instruct:
 python export.py codellama2_7b_instruct.bin --meta-llama /path/to/CodeLlama-7b-Instruct
 python tokenizer.py --tokenizer-model=/path/to/CodeLlama-7b-Instruct/tokenizer.model
 ./run codellama2_7b_instruct.bin -m chat -z /path/to/CodeLlama-7b-Instruct/tokenizer.bin
 ```
 ## huggingface models
@@ -406,6 +406,12 @@ def load_hf_model(model_path):
 # API entrypoint
 def model_export(model, filepath, version):
    """
    Versions docs:
    v0: legacy llama2.c float format, DEPRECATED
    v1: float32 export
    v2: int8 quantized Q8_0 export, similar to llama.cpp, in groups
    """
    if version == 0:
        legacy_export(model, filepath)
    elif version == 1: