From fbe324fc5ab61eaab3b8f74694be5b3870d3d5ee Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Fri, 25 Aug 2023 14:54:05 +0000
Subject: [PATCH] adjust things a bit

---
 README.md | 19 ++++++++++++++-----
 run.c     |  8 ++++++--
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index e9df1f6..8b05b49 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,18 @@ This ran at about 4 tokens/s compiled with [OpenMP](#OpenMP) on 96 threads on my
 
 base models... ¯\\_(ツ)_/¯. Since we can inference the base model, it should be possible to also inference the chat model quite easily, and have a conversation with it. And if we can find a way to run 7B more efficiently, we can start adding LoRA to our training script, and going wild with finetunes all within the repo!
 
+You can also chat with the Llama Chat models. Export the chat model exactly as above:
+
+```bash
+python export.py llama2_7b_chat.bin --meta-llama /path/to/7B-chat
+```
+
+Then chat with it by specifying the chat mode using the `-m` flag, e.g.:
+
+```bash
+./run llama2_7b_chat.bin -m chat
+```
+
 ## hugginface models
 
 We can load any huggingface models that use the Llama 2 architecture. See the script [export.py](export.py) and the `--hf` flag to export the model .bin file.
@@ -207,8 +219,7 @@ You can also experiment with replacing `gcc` with `clang`.
 
 If compiling with gcc, try experimenting with `-funroll-all-loops`, see PR [#183](https://github.com/karpathy/llama2.c/pull/183)
 
-### OpenMP
-Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul and attention, allowing the work in the loops to be split up over multiple processors.
+**OpenMP**. Big improvements can also be achieved by compiling with OpenMP, which "activates" the `#pragma omp parallel for` inside the matmul and attention, allowing the work in the loops to be split up over multiple processors.
 You'll need to install the OpenMP library and the clang compiler first (e.g. `apt install clang libomp-dev` on ubuntu). Then you can compile with `make runomp`, which does:
 
 ```bash
@@ -324,12 +335,10 @@ If your candidate PRs have elements of these it doesn't mean they won't get merg
 
 ## unsorted todos
 
-- support Llama 2 7B Chat models with a Chat UI/UX in run.c, very similar to llama.cpp
-- ability to calculate perplexity in run.c, exactly as done in llama.cpp
 - add support in run.c of reading version 1+ files from export, later deprecate "version 0"
-- add more tests inside [test.c](test.c) (call for help!)
 - runq.c (int8 quantization) add
 - run.cu (CUDA) investigate and merge
+- add more tests inside [test.c](test.c)
 - make it easier to add a new dataset with not too much pain
 - (LoRA) finetuning and export of Llama 2 models
 
diff --git a/run.c b/run.c
index 40f68e6..9df918e 100644
--- a/run.c
+++ b/run.c
@@ -800,16 +800,20 @@ void read_stdin(const char* guide, char* buffer, size_t bufsize) {
 
 // ----------------------------------------------------------------------------
 // chat loop
+// I manually inspected the tokens for a few chat conversations against the
+// Python reference and they looked ok, but this was not thoroughly tested and
+// is not safely implemented; it's more of a proof of concept atm.
 
 void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler,
           char *cli_user_prompt, char *cli_system_prompt, int steps) {
 
     // buffers for reading the system prompt and user prompt from stdin
+    // you'll notice they are somewhat haphazardly and unsafely set atm
     char system_prompt[512];
     char user_prompt[512];
-    char rendered_prompt[512];
+    char rendered_prompt[1152];
     int num_prompt_tokens = 0;
-    int* prompt_tokens = (int*)malloc(512 * sizeof(int));
+    int* prompt_tokens = (int*)malloc(1152 * sizeof(int));
     int user_idx;
 
     // start the main loop