From 12112bfa480cca0e451c979103183511b02d1a13 Mon Sep 17 00:00:00 2001 From: ltoniazzi Date: Fri, 21 Jun 2024 16:44:33 +0100 Subject: [PATCH 01/14] Add basic cpu setup --- BRANCH_SETUP.md | 48 ++++++++ common/common.cpp | 8 ++ common/common.h | 1 + data/hot-lora.txt | 2 + ggml.c | 46 +++++++ ggml.h | 19 +++ llama.cpp | 299 +++++++++++++++++++++++++++++++++++++++++++++- llama.h | 3 + 8 files changed, 423 insertions(+), 3 deletions(-) create mode 100644 BRANCH_SETUP.md create mode 100644 data/hot-lora.txt diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md new file mode 100644 index 0000000000000..dac58d0d2de31 --- /dev/null +++ b/BRANCH_SETUP.md @@ -0,0 +1,48 @@ +# Setup this branch + +## Create a lora adpter bin file + +0. `mkdir models/open-llama` and download [Open-llama (all files)](https://huggingface.co/openlm-research/open_llama_3b_v2/tree/main) in the folder `./models/open-llama` + +2. `mkdir data && touch data/hot-lora.txt` and write a couple of words in it. + +3. Run: + ```bash + # Convert base model to gguf + python3 convert-hf-to-gguf.py models/open-llama/ + # Quantize base model + ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q8_0.gguf Q8_0 + # Obtain Lora adapter + ./finetune --model-base models/open-llama/ggml-model-q8_0.gguf \ + --checkpoint-in models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-LATEST.gguf \ + --checkpoint-out models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-ITERATION.gguf \ + --lora-out models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ + --train-data "data/hot-lora.txt" \ + --save-every 1 \ + --threads 1 \ + --adam-iter 1 \ + --batch 1 \ + --ctx 16 \ + --use-checkpointing + ``` + +## Run main with adapter + +Run main with base model and lora adapter to hot-swap +```bash +./main ./models/open-llama/ggml-model-f16.gguf \ +--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ +-ngl 0 \ +-n 128 +``` + +With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors. + +# Logic + + + +# Current status + +- Only ony Lora adapter can be passed. +- GPU not supported \ No newline at end of file diff --git a/common/common.cpp b/common/common.cpp index 1591790e6df4c..494258db0ed48 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -789,6 +789,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.model = argv[i]; return true; } + if (arg == "-hl" || arg == "--hot-lora") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hot_lora = argv[i]; + return true; + } if (arg == "-md" || arg == "--model-draft") { if (++i >= argc) { invalid_param = true; diff --git a/common/common.h b/common/common.h index 2345d855eed3c..cd9d6370cf47f 100644 --- a/common/common.h +++ b/common/common.h @@ -100,6 +100,7 @@ struct gpt_params { std::string model = ""; // model path std::string model_draft = ""; // draft model for speculative decoding + std::string hot_lora = ""; // lora model path for hot swapping std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download std::string hf_repo = ""; // HF repo diff --git a/data/hot-lora.txt b/data/hot-lora.txt new file mode 100644 index 0000000000000..c43186710e906 --- /dev/null +++ b/data/hot-lora.txt @@ -0,0 +1,2 @@ + + how are you? 
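The `# Logic` section above is left empty in this patch; the computation that the `llama.cpp` changes below implement in `lora_mul_mat` is the usual low-rank update `y = W*x + scale * (B * (A*x))`, applied at matmul time rather than by merging the adapter into `W`. Below is a minimal, CPU-only ggml sketch of that update for illustration only: the shapes (`n_in`, `n_out`, rank `r`, `n_tokens`) and the constant inputs are made up, and the real code operates on the model's quantized weight tensors and the adapter tensors loaded from the `.bin` file instead. It matches the form `lora_mul_mat` converges to in the later patches of this series (`B * (A * x)`, then scale, then add).

```cpp
// Hypothetical stand-alone sketch of the hot-LoRA matmul:
//   y = W*x + scale * (B * (A*x))
// Shapes and input values are illustrative only.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,   // let ggml allocate tensor data in this context (CPU)
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_in = 4, n_out = 6, r = 2, n_tokens = 3;

    // base weight, activations, and the two low-rank adapter matrices
    struct ggml_tensor * W = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);    // like blk.*.attn_q.weight
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_tokens); // like `cur`
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, r);        // like *.loraA
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, r,    n_out);    // like *.loraB

    // what lora_mul_mat builds: the base matmul plus the scaled low-rank correction
    struct ggml_tensor * base   = ggml_mul_mat(ctx, W, x);                        // W*x     -> [n_out, n_tokens]
    struct ggml_tensor * t_lora = ggml_mul_mat(ctx, B, ggml_mul_mat(ctx, A, x));  // B*(A*x) -> [n_out, n_tokens]
    t_lora = ggml_scale(ctx, t_lora, 1.0f);   // lora_scale (alpha/r in general)
    struct ggml_tensor * y = ggml_add(ctx, base, t_lora);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);

    // dummy inputs, just so the graph can be computed
    ggml_set_f32(W, 0.1f);
    ggml_set_f32(x, 1.0f);
    ggml_set_f32(A, 0.2f);
    ggml_set_f32(B, 0.3f);

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    printf("y[0] = %f\n", ggml_get_f32_1d(y, 0));

    ggml_free(ctx);
    return 0;
}
```

When the adapter map has no entry for a weight's name, the patched path falls back to returning the plain `ggml_mul_mat` result, so running without `--hot-lora` is unchanged.
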
diff --git a/ggml.c b/ggml.c index 1fc77743bc7b9..0fb8dafbd2ab5 100644 --- a/ggml.c +++ b/ggml.c @@ -4313,6 +4313,52 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam return NULL; } +//////// LORA + +struct lora_tensor_pair* build_lora_weights_map(struct ggml_context* ctx) { + struct lora_tensor_pair* pair = malloc(sizeof(struct lora_tensor_pair)); + if (!pair) return NULL; + pair->pairs = NULL; + pair->count = 0; + pair->capacity = 0; + + struct ggml_object * obj = ctx->objects_begin; + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { + struct ggml_tensor * tensor = (struct ggml_tensor *)(mem_buffer + obj->offs); + char * tensor_name = tensor->name; + + if (strlen(tensor_name) > 6 && (strcmp(tensor_name + strlen(tensor_name) - 6, ".loraA") == 0 || + strcmp(tensor_name + strlen(tensor_name) - 6, ".loraB") == 0)) { + if (pair->count == pair->capacity) { + pair->capacity = pair->capacity > 0 ? pair->capacity * 2 : 4; + pair->pairs = realloc(pair->pairs, pair->capacity * sizeof(struct lora_tensor_info)); + } + + pair->pairs[pair->count].name = strdup(tensor_name); + pair->pairs[pair->count].tensor = tensor; + pair->count++; + } + } + obj = obj->next; + } + + return pair; +} + +void free_lora_tensor_pair(struct lora_tensor_pair* pair) { + if (!pair) return; + for (int i = 0; i < pair->count; i++) { + free(pair->pairs[i].name); + } + free(pair->pairs); + free(pair); +} + +//////// LORA + //////////////////////////////////////////////////////////////////////////////// // ggml_dup diff --git a/ggml.h b/ggml.h index 13502a3622fc4..d843699084840 100644 --- a/ggml.h +++ b/ggml.h @@ -835,6 +835,25 @@ extern "C" { GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + struct lora_tensor_info { + char* name; + struct ggml_tensor* tensor; + }; + + struct lora_tensor_pair { + struct lora_tensor_info* pairs; // Dynamic array of tensor pairs + int count; + int capacity; + }; + + // Function to build tensor pairs + struct lora_tensor_pair* build_lora_weights_map(struct ggml_context* ctx); + + // Cleanup function for lora_tensor_pair + void free_lora_tensor_pair(struct lora_tensor_pair* pair); + + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); diff --git a/llama.cpp b/llama.cpp index 8b675ea993a38..58b6ff8640447 100644 --- a/llama.cpp +++ b/llama.cpp @@ -119,6 +119,212 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, // helpers // +///////// LORA + +struct lora_weights { + ggml_tensor* loraA; + ggml_tensor* loraB; +}; + +struct export_lora_params { + std::string fn_model_base; + std::string fn_model_out; + std::vector lora; + int n_threads; +}; + +static struct export_lora_params get_default_export_lora_params() { + struct export_lora_params result; + result.fn_model_base = ""; + result.fn_model_out = ""; + result.n_threads = GGML_DEFAULT_N_THREADS; + return result; +} + +struct lora_info { + std::string filename; + float scale; +}; +// TODO lora_data should maybe sub lora_weights in llama.cpp +struct lora_data { + struct lora_info info; + std::vector data; + struct ggml_context * ctx; + + uint32_t lora_r; + uint32_t 
lora_alpha; +}; + +struct llama_file_lora { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file_lora(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + bool eof() { + return tell() >= size; + } + + ~llama_file_lora() { + if (fp) { + std::fclose(fp); + } + } +}; + +static void free_lora(struct lora_data * lora) { + if (lora->ctx != NULL) { + ggml_free(lora->ctx); + } + delete lora; +} + +static struct lora_data * load_lora(struct lora_info * info) { + struct lora_data * result = new struct lora_data; + result->info = *info; + result->ctx = NULL; + result->lora_r = 1; + result->lora_alpha = 1; + + struct llama_file_lora file(info->filename.c_str(), "rb"); + if (file.fp == NULL) { + fprintf(stderr, "warning: Could not open lora adapter '%s'. 
Ignoring this adapter.\n", + info->filename.c_str()); + free_lora(result); + return NULL; + } + + struct ggml_init_params params_ggml; + params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; + params_ggml.mem_buffer = NULL; + params_ggml.no_alloc = true; + result->ctx = ggml_init(params_ggml); + + uint32_t magic = file.read_u32(); + if (magic != LLAMA_FILE_MAGIC_GGLA) { + die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); + } + uint32_t version = file.read_u32(); + if (version != 1) { + die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); + } + result->lora_r = file.read_u32(); + result->lora_alpha = file.read_u32(); + // read tensor infos from file + std::vector name_buf; + std::vector tensors; + std::vector tensors_offset; + size_t total_nbytes_pad = 0; + while(!file.eof()) { + int64_t ne[4] = {1,1,1,1}; + uint32_t n_dims = file.read_u32(); + uint32_t namelen = file.read_u32(); + uint32_t type = file.read_u32(); + for (uint32_t k = 0; k < n_dims; ++k) { + ne[k] = (int64_t)file.read_u32(); + } + name_buf.clear(); + name_buf.resize(namelen + 1, '\0'); + file.read_raw(name_buf.data(), namelen); + file.seek((0-file.tell()) & 31, SEEK_CUR); + size_t offset = file.tell(); + struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); + ggml_set_name(tensor, name_buf.data()); + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + total_nbytes_pad += nbytes_pad; + tensors.push_back(tensor); + tensors_offset.push_back(offset); + file.seek(nbytes, SEEK_CUR); + } + // read tensor data + result->data.resize(total_nbytes_pad); + size_t data_offset = 0; + for (size_t i = 0; i < tensors.size(); ++i) { + struct ggml_tensor * tensor = tensors[i]; + size_t offset = tensors_offset[i]; + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + file.seek(offset, SEEK_SET); + tensor->data = result->data.data() + data_offset; + file.read_raw(tensor->data, nbytes); + data_offset += nbytes_pad; + } + return result; +} + +///////// LORA + static size_t utf8_len(char src) { const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; uint8_t highbits = static_cast(src) >> 4; @@ -2295,6 +2501,10 @@ struct llama_context { } llama_cparams cparams; + bool lora_loaded = false; + std::map lora_weights_map; + lora_data llora_data; + float lora_scale = 1.0f; std::vector backends; #ifdef GGML_USE_METAL @@ -7447,21 +7657,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = lora_mul_mat(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = lora_mul_mat(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = lora_mul_mat(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9470,6 +9680,35 @@ struct llm_build_context { return gf; } + static ggml_tensor * lora_mul_mat( + llama_context & 
lctx, + ggml_context * ctx0, + ggml_tensor * weight, + ggml_tensor * cur) { + ggml_tensor * mm = ggml_mul_mat(ctx0, weight, cur); + + auto it = lctx.lora_weights_map.find(weight->name); + if (it == lctx.lora_weights_map.end()) { + return mm; + } + + ggml_tensor * loraA = it->second.loraA; + ggml_tensor * loraB = it->second.loraB; + + ggml_tensor * t_lora = ggml_mul_mat(ctx0, + ggml_mul_mat(ctx0, loraA, loraB), + cur + ); + + if (lctx.lora_scale != 1.0f) { + t_lora = ggml_scale(ctx0, t_lora, lctx.lora_scale); + } + + ggml_tensor * t_patch = ggml_add(ctx0, mm, t_lora); + return t_patch; + +} + struct ggml_cgraph * build_phi3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -16025,6 +16264,29 @@ void llama_free_model(struct llama_model * model) { delete model; } + +static std::map get_lora_weights_map_cpp(struct ggml_context* ctx) { + struct lora_tensor_pair* pair = build_lora_weights_map(ctx); + std::map map; + + if (pair) { + for (int i = 0; i < pair->count; i++) { + std::string name(pair->pairs[i].name); + std::string base_name = name.substr(0, name.size() - 6); + std::string suffix = name.substr(name.size() - 6); + + if (suffix == ".loraA") { + map[base_name].loraA = pair->pairs[i].tensor; + } else if (suffix == ".loraB") { + map[base_name].loraB = pair->pairs[i].tensor; + } + } + free_lora_tensor_pair(pair); + } + + return map; +} + struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { @@ -16056,6 +16318,37 @@ struct llama_context * llama_new_context_with_model( llama_context * ctx = new llama_context(*model); + /// LORA + struct export_lora_params * lora_params = new struct export_lora_params; + struct lora_info lora; + lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin"; + lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras + lora_params->lora.push_back(lora); + // load all loras + std::vector loras; + for (size_t i = 0; i < lora_params->lora.size(); ++i) { + struct lora_data * llora_data = load_lora(&lora_params->lora[i]); + if (llora_data != NULL) { + loras.push_back(llora_data); + } + } + if (loras.size() == 0) { + fprintf(stderr, "warning: no lora adapters will be applied.\n"); + } + // Assign data + ctx->llora_data = *loras[0]; + + // build the map? + ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); + std::vector keys; + for (const auto& pair : ctx->lora_weights_map) { + keys.push_back(pair.first); + } + + + + /// END LORA + const auto & hparams = model->hparams; auto & cparams = ctx->cparams; diff --git a/llama.h b/llama.h index 62908261f2791..85a53f1e65819 100644 --- a/llama.h +++ b/llama.h @@ -45,6 +45,9 @@ #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ #define LLAMA_STATE_SEQ_VERSION 1 +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) 
do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + #ifdef __cplusplus extern "C" { #endif From 26df64ad04e377dc24427da0e178cdc67cd86e49 Mon Sep 17 00:00:00 2001 From: ltoniazzi Date: Fri, 21 Jun 2024 17:28:14 +0100 Subject: [PATCH 02/14] Fix passing param --- BRANCH_SETUP.md | 10 +++++---- common/common.cpp | 4 ++++ llama.cpp | 53 +++++++++++++++++++++++++---------------------- llama.h | 1 + 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md index dac58d0d2de31..d9f7405b5cab6 100644 --- a/BRANCH_SETUP.md +++ b/BRANCH_SETUP.md @@ -30,19 +30,21 @@ Run main with base model and lora adapter to hot-swap ```bash -./main ./models/open-llama/ggml-model-f16.gguf \ ---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ +./main -m ./models/open-llama/ggml-model-f16.gguf \ +--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \ -ngl 0 \ -n 128 ``` -With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors. +With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors. # Logic + # Current status -- Only ony Lora adapter can be passed. +- Only one Lora adapter can be passed. +- Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers) - GPU not supported \ No newline at end of file diff --git a/common/common.cpp b/common/common.cpp index 494258db0ed48..21003343e4740 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_ubatch = params.n_ubatch; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + const char* c_string = params.hot_lora.c_str(); + strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1); + cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination + cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; diff --git a/llama.cpp b/llama.cpp index 58b6ff8640447..467ab0f29c220 100644 --- a/llama.cpp +++ b/llama.cpp @@ -145,7 +145,7 @@ struct lora_info { std::string filename; float scale; }; -// TODO lora_data should maybe sub lora_weights in llama.cpp +// TODO lora_data should maybe sub lora_weights struct lora_data { struct lora_info info; std::vector data; @@ -2502,7 +2502,7 @@ struct llama_context { llama_cparams cparams; bool lora_loaded = false; - std::map lora_weights_map; + std::map lora_weights_map; // only one LoRA adapter at the moment lora_data llora_data; float lora_scale = 1.0f; @@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() { /*.n_seq_max =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.hot_lora =*/ "", /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, @@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model( /// LORA struct export_lora_params * lora_params = new struct export_lora_params; struct lora_info lora; - lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin"; - lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras - lora_params->lora.push_back(lora); - // load all loras - std::vector loras; - for (size_t i = 0; i < lora_params->lora.size(); ++i) { - struct lora_data * llora_data = load_lora(&lora_params->lora[i]); - if (llora_data != NULL) { - loras.push_back(llora_data); + // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin"; + lora.filename = params.hot_lora; + if (strlen(params.hot_lora) > 0) { + + lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras? + lora_params->lora.push_back(lora); + // load all loras + std::vector loras; + for (size_t i = 0; i < lora_params->lora.size(); ++i) { + struct lora_data * llora_data = load_lora(&lora_params->lora[i]); + if (llora_data != NULL) { + loras.push_back(llora_data); + } } - } - if (loras.size() == 0) { - fprintf(stderr, "warning: no lora adapters will be applied.\n"); - } - // Assign data - ctx->llora_data = *loras[0]; + if (loras.size() == 0) { + fprintf(stderr, "warning: no lora adapters will be applied.\n"); + } + // Assign data + ctx->llora_data = *loras[0]; - // build the map? - ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); - std::vector keys; - for (const auto& pair : ctx->lora_weights_map) { - keys.push_back(pair.first); + // build the map? + ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); + std::vector keys; + for (const auto& pair : ctx->lora_weights_map) { + keys.push_back(pair.first); + } } - - - /// END LORA + /// LORA const auto & hparams = model->hparams; auto & cparams = ctx->cparams; diff --git a/llama.h b/llama.h index 85a53f1e65819..d593eb45c9dab 100644 --- a/llama.h +++ b/llama.h @@ -292,6 +292,7 @@ extern "C" { uint32_t n_seq_max; // max number of sequences (i.e. 
distinct states for recurrent models) uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing + char hot_lora[256]; // path to the hot lora file enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id From 5c4ba81933f417ce9378572513057dd4db6feebf Mon Sep 17 00:00:00 2001 From: ltoniazzi Date: Fri, 21 Jun 2024 18:00:12 +0100 Subject: [PATCH 03/14] Remove comment --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 467ab0f29c220..15e83b0c45cb2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -16322,7 +16322,6 @@ struct llama_context * llama_new_context_with_model( /// LORA struct export_lora_params * lora_params = new struct export_lora_params; struct lora_info lora; - // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin"; lora.filename = params.hot_lora; if (strlen(params.hot_lora) > 0) { From 028d3f7c8977ef15c88966b1998ae2bcdf13f3f0 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Mon, 1 Jul 2024 22:16:11 +0100 Subject: [PATCH 04/14] Metal running (still buffer issues) --- BRANCH_SETUP.md | 254 ++++++++++++++++++++++++++++++++++++++++- examples/main/main.cpp | 67 +++++++++++ llama.cpp | 103 +++++++++++++++-- 3 files changed, 414 insertions(+), 10 deletions(-) diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md index d9f7405b5cab6..0b6cdac74a45e 100644 --- a/BRANCH_SETUP.md +++ b/BRANCH_SETUP.md @@ -36,6 +36,8 @@ Run main with base model and lora adapter to hot-swap -n 128 ``` +Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil` + With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors. # Logic @@ -47,4 +49,254 @@ With `ngl > 0` the code breaks. Probably because the Lora tensors try to interac - Only one Lora adapter can be passed. 
- Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers) -- GPU not supported \ No newline at end of file +- GPU not supported + + + + +# Tutorial + +```cpp +#include "llama.h" + +#include "unicode.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_RPC +# include "ggml-rpc.h" +#endif + +#ifdef GGML_USE_CUDA +# include "ggml-cuda.h" +#elif defined(GGML_USE_VULKAN) +# include "ggml-vulkan.h" +#elif defined(GGML_USE_SYCL) +# include "ggml-sycl.h" +#elif defined(GGML_USE_KOMPUTE) +# include "ggml-kompute.h" +#endif + +#ifdef GGML_USE_METAL +# include "ggml-metal.h" +#endif + +// TODO: replace with ggml API call +#define QK_K 256 + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-metal.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) +#endif + +#define LLAMA_MAX_NODES 8192 +#define LLAMA_MAX_EXPERTS 160 + + +int main() { + struct ggml_init_params params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + /*.no_alloc =*/ true, + }; + + // The library allows the user to define a certain function using the available tensor operations. This function + // definition is represented internally via a computation graph. Each tensor operation in the function definition + // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the + // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized + // using one of the available optimization algorithms. 
+ // + // For example, here we define the function: f(x) = a*x^2 + b + + // memory allocation happens here + // Create context allogating memory + struct ggml_context * ctx = ggml_init(params); + + struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_set_param(ctx, x); // x is an input variable + + struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_mul(ctx, x, x); + struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + + // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); + // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + ggml_used_mem(ctx); + + // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp + // How to check which buffer is the context allocated, + // can look at single tensors? option, check in inited in base model + + // Try this + // You can simplify all of this for testing, and if you are using CPU only, and just run with -ngl 0 + // and allocate everything in a CPU buffer by using + // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); + // or run with -ngl 99 and use a Metal buffer type instead with + // ggml_backend_metal_buffer_type() + // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend + // to allocate the tensors, it will just be slower. + + // Notice that the function definition above does not involve any actual computation. The computation is performed only + // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: + + + ggml_build_forward_expand(gf, f); + + // set the input variable and parameter values + ggml_set_f32(x, 2.0f); + ggml_set_f32(a, 3.0f); + ggml_set_f32(b, 4.0f); + + ggml_graph_compute_with_ctx(ctx, gf, 1); + + printf("f = %f\n", ggml_get_f32_1d(f, 0)); + + // The actual computation is performed in the ggml_graph_compute() function. + // + // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the + // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know + // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory + // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was + // actually needed. + // + // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic + // differentiation and optimization algorithms. + // + // The described approach allows to define the function graph once and then compute its forward or backward graphs + // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way + // the user can avoid the memory allocation overhead at runtime. + // + // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class + // citizens, but in theory the library can be extended to support FP8 and integer data types. + // + // Each tensor operation produces a new tensor. 
Initially the library was envisioned to support only the use of unary + // and binary operations. Most of the available operations fall into one of these two categories. With time, it became + // clear that the library needs to support more complex operations. The way to support these operations is not clear + // yet, but a few examples are demonstrated in the following operations: + // + // - ggml_permute() + // - ggml_conv_1d_1s() + // - ggml_conv_1d_2s() + // + // For each tensor operator, the library implements a forward and backward computation function. The forward function + // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the + // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a + // calculus class, or watch the following video: + // + // What is Automatic Differentiation? + // https://www.youtube.com/watch?v=wG_nF1awSSY + + // ## Tensor data (struct ggml_tensor) + // + // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of + // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains + // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: + + struct ggml_tensor * c = ggml_add(ctx, a, b); + + assert(c->src[0] == a); + assert(c->src[1] == b); + + // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the + // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows + // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and + // permutation. All tensor operations have to take the stride into account and not assume that the tensor is + // contiguous in memory. + + // The data of the tensor is accessed via the "data" pointer. For example: + + const int nx = 2; + const int ny = 3; + + struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); + + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y; + } + } + + // + // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
+ // + + } + ``` \ No newline at end of file diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b97b7b7937f02..bdcf6f998c2d9 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,7 +117,74 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } +#include "ggml-metal.h" + +bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) { + return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size)); +} + + +void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) { + struct ggml_tensor * first = ggml_get_first_tensor(ctx); + for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->data != NULL) { + if (!is_pointer_in_buffer_range(t->data, buffer, buffer_size)) { + fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name); + } else { + printf("Tensor %s is correctly allocated in the buffer.\n", t->name); + } + } + } +} + int main(int argc, char ** argv) { + + + // The library allows the user to define a certain function using the available tensor operations. This function + // definition is represented internally via a computation graph. Each tensor operation in the function definition + // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the + // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized + // using one of the available optimization algorithms. + // + // For example, here we define the function: f(x) = a*x^2 + b + + // memory allocation happens here + // Create context allogating memory + struct ggml_init_params _params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = true, + }; + struct ggml_context * _ctx = ggml_init(_params); + + struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); + + // ggml_set_param(_ctx, x); // x is an input variable + + // struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); + // struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); + // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x); + // struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b); + + // struct ggml_cgraph * gf = ggml_new_graph(_ctx); + + // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type()); + // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + else { + size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type()); + + // Verify tensor allocations + verify_tensor_allocation(_ctx, buf, buffer_size); + } + ggml_used_mem(_ctx); + // + + + gpt_params params; g_params = ¶ms; diff --git a/llama.cpp b/llama.cpp index 15e83b0c45cb2..744e4f8c3fb64 100644 --- a/llama.cpp +++ b/llama.cpp @@ -307,6 +307,11 @@ static struct lora_data * load_lora(struct lora_info * info) { tensors_offset.push_back(offset); file.seek(nbytes, SEEK_CUR); } + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); + } // read tensor data 
result->data.resize(total_nbytes_pad); size_t data_offset = 0; @@ -3922,7 +3927,7 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; - + // Allocate tensors data to buffer for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -3951,7 +3956,7 @@ struct llama_model_loader { return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); })); } - + // TODO LORA allocation of base tensors GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, data); @@ -5392,7 +5397,7 @@ static bool llm_load_tensors( auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; model.layers.resize(n_layer); - + // main players model, ml, ctx_input/output, tn (gets name?) const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: @@ -6666,7 +6671,7 @@ static bool llm_load_tensors( #endif } } -#ifdef GGML_USE_METAL +#ifdef GGML_USE_METAL // LORA Use metal on base tensors else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) { for (uint32_t idx = 0; idx < ml.files.size(); idx++) { const size_t max_size = ggml_get_max_tensor_size(ctx); @@ -16341,12 +16346,92 @@ struct llama_context * llama_new_context_with_model( // Assign data ctx->llora_data = *loras[0]; - // build the map? - ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); - std::vector keys; - for (const auto& pair : ctx->lora_weights_map) { - keys.push_back(pair.first); + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); } + // Looks this worked, need to check if tensors have new buffer (not sure below). + // Also do we need to set the tensors? not clear where data is, looks like it is loaded after the + // tensor creation in context, but loaded where? cuz if data present dfferebt way to set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc + + // TODO looks like I have already a context with load_lora, understand if + // I am using it + // If the contexg it set to right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); + // As I should already have created the tensors in the context, + // Understand where are the weights loaded instead + // Load the weight/data in the context + // Maybe check finetuning approach at managing the lora weights. + + + + // build the map? 
TODO LORA ctx->lora_weights_map layers seem to not have buffer type but it should as the simple example does + ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); + // std::vector keys; + // for (const auto& pair : ctx->lora_weights_map) { + // keys.push_back(pair.first); + + // ggml_tensor * tensorA = pair.second.loraA; + // ggml_tensor * tensorB = pair.second.loraB; + + // ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne); + // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne); + + // } + + // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) { + // const auto * name = ggml_get_name(cur); + // // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA)); + // // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB)); + + // } + + // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + // const auto * weight = get_weight(ggml_get_name(cur)); + // if (weight == nullptr) { + // // this can happen with split experts models + // continue; + // } + + // if (progress_callback) { + // if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + // return false; + // } + // } + + // size_t n_size = ggml_nbytes(cur); + + // if (use_mmap) { + // const auto & mapping = mappings.at(weight->idx); + // ggml_backend_buffer_t buf_mmap = nullptr; + // if (bufs_mmap.count(weight->idx)) { + // buf_mmap = bufs_mmap.at(weight->idx); + // } + // uint8_t * data = (uint8_t *) mapping->addr + weight->offs; + + // if (check_tensors) { + // validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { + // return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); + // })); + // } + // // TODO LORA allocation of base tensors + // GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated + // if (buf_mmap && cur->data == nullptr) { + // ggml_backend_tensor_alloc(buf_mmap, cur, data); + // if (lmlocks) { + // const auto & lmlock = lmlocks->at(weight->idx); + // lmlock->grow_to(weight->offs + n_size); + // } + + // auto & mmap_used = mmaps_used[weight->idx]; + // mmap_used.first = std::min(mmap_used.first, weight->offs); + // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); + // } else { + // ggml_backend_tensor_set(cur, data, 0, n_size); + + + + } /// LORA From 1103bdb57476b65404221f87b37ed2f91ffd4492 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Tue, 2 Jul 2024 21:59:54 +0100 Subject: [PATCH 05/14] Fixed buffer allocation --- examples/main/main.cpp | 46 ------------------------------------------ llama.cpp | 30 ++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 51 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index bdcf6f998c2d9..5e9e4001de1b5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -139,52 +139,6 @@ void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t b int main(int argc, char ** argv) { - - // The library allows the user to define a certain function using the available tensor operations. This function - // definition is represented internally via a computation graph. Each tensor operation in the function definition - // corresponds to a node in the graph. 
Having the computation graph defined, the user can choose to compute the - // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized - // using one of the available optimization algorithms. - // - // For example, here we define the function: f(x) = a*x^2 + b - - // memory allocation happens here - // Create context allogating memory - struct ggml_init_params _params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = true, - }; - struct ggml_context * _ctx = ggml_init(_params); - - struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); - - // ggml_set_param(_ctx, x); // x is an input variable - - // struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); - // struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); - // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x); - // struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b); - - // struct ggml_cgraph * gf = ggml_new_graph(_ctx); - - // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type()); - // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend buffer"); - } - else { - size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type()); - - // Verify tensor allocations - verify_tensor_allocation(_ctx, buf, buffer_size); - } - ggml_used_mem(_ctx); - // - - - gpt_params params; g_params = ¶ms; diff --git a/llama.cpp b/llama.cpp index 744e4f8c3fb64..cd4b43e945e8e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -150,6 +150,11 @@ struct lora_data { struct lora_info info; std::vector data; struct ggml_context * ctx; + // the backend to perform the computation (CPU, CUDA, METAL) + ggml_backend_t backend = NULL; + + // the backend buffer to storage the tensors data of a and b + ggml_backend_buffer_t buffer; uint32_t lora_r; uint32_t lora_alpha; @@ -253,9 +258,17 @@ static struct lora_data * load_lora(struct lora_info * info) { struct lora_data * result = new struct lora_data; result->info = *info; result->ctx = NULL; + result->backend = NULL; + result->buffer = NULL; result->lora_r = 1; result->lora_alpha = 1; + fprintf(stderr, "%s: using Metal backend\n", __func__); + result->backend = ggml_backend_metal_init(); + if (!result->backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + struct llama_file_lora file(info->filename.c_str(), "rb"); if (file.fp == NULL) { fprintf(stderr, "warning: Could not open lora adapter '%s'. 
Ignoring this adapter.\n", @@ -307,9 +320,10 @@ static struct lora_data * load_lora(struct lora_info * info) { tensors_offset.push_back(offset); file.seek(nbytes, SEEK_CUR); } + result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend); - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); - if (!buf) { + // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); + if (!result->buffer) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); } // read tensor data @@ -321,9 +335,15 @@ static struct lora_data * load_lora(struct lora_info * info) { size_t nbytes = ggml_nbytes(tensor); size_t nbytes_pad = ggml_nbytes_pad(tensor); file.seek(offset, SEEK_SET); - tensor->data = result->data.data() + data_offset; - file.read_raw(tensor->data, nbytes); - data_offset += nbytes_pad; + + std::vector read_buf; + read_buf.resize(ggml_nbytes(tensor)); + file.read_raw(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + // tensor_tmp->data = result->data.data() + data_offset; + // file.read_raw(tensor_tmp->data, nbytes); + // data_offset += nbytes_pad; + // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor)); } return result; } From 1734f3f0f842791abeaeb391e1324b50be120eb9 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Tue, 2 Jul 2024 22:18:35 +0100 Subject: [PATCH 06/14] Clean up --- examples/main/main.cpp | 19 ------- llama.cpp | 115 +++++++++-------------------------------- 2 files changed, 25 insertions(+), 109 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5e9e4001de1b5..ba76a496b5999 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,25 +117,6 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } -#include "ggml-metal.h" - -bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) { - return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size)); -} - - -void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) { - struct ggml_tensor * first = ggml_get_first_tensor(ctx); - for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->data != NULL) { - if (!is_pointer_in_buffer_range(t->data, buffer, buffer_size)) { - fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name); - } else { - printf("Tensor %s is correctly allocated in the buffer.\n", t->name); - } - } - } -} int main(int argc, char ** argv) { diff --git a/llama.cpp b/llama.cpp index cd4b43e945e8e..fe842e4d82b58 100644 --- a/llama.cpp +++ b/llama.cpp @@ -263,11 +263,27 @@ static struct lora_data * load_lora(struct lora_info * info) { result->lora_r = 1; result->lora_alpha = 1; +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + result->backend = ggml_backend_cuda_init(0); // init device 0 + if (!result->backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + +#ifdef GGML_USE_METAL fprintf(stderr, "%s: using Metal backend\n", __func__); result->backend = ggml_backend_metal_init(); if (!result->backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); } +#endif + + // if there aren't GPU Backends fallback to CPU backend + if 
(!result->backend) { + result->backend = ggml_backend_cpu_init(); + } + struct llama_file_lora file(info->filename.c_str(), "rb"); if (file.fp == NULL) { @@ -320,30 +336,24 @@ static struct lora_data * load_lora(struct lora_info * info) { tensors_offset.push_back(offset); file.seek(nbytes, SEEK_CUR); } - result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend); - // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); + + + result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend); if (!result->buffer) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); } // read tensor data result->data.resize(total_nbytes_pad); - size_t data_offset = 0; for (size_t i = 0; i < tensors.size(); ++i) { struct ggml_tensor * tensor = tensors[i]; size_t offset = tensors_offset[i]; size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); file.seek(offset, SEEK_SET); - std::vector read_buf; - read_buf.resize(ggml_nbytes(tensor)); - file.read_raw(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); - // tensor_tmp->data = result->data.data() + data_offset; - // file.read_raw(tensor_tmp->data, nbytes); - // data_offset += nbytes_pad; - // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor)); + read_buf.resize(nbytes); + file.read_raw(read_buf.data(), nbytes); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes); } return result; } @@ -16344,7 +16354,7 @@ struct llama_context * llama_new_context_with_model( llama_context * ctx = new llama_context(*model); - /// LORA + /// LORA load start struct export_lora_params * lora_params = new struct export_lora_params; struct lora_info lora; lora.filename = params.hot_lora; @@ -16365,27 +16375,6 @@ struct llama_context * llama_new_context_with_model( } // Assign data ctx->llora_data = *loras[0]; - - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); - if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); - } - // Looks this worked, need to check if tensors have new buffer (not sure below). - // Also do we need to set the tensors? not clear where data is, looks like it is loaded after the - // tensor creation in context, but loaded where? cuz if data present dfferebt way to set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc - - // TODO looks like I have already a context with load_lora, understand if - // I am using it - // If the contexg it set to right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); - // As I should already have created the tensors in the context, - // Understand where are the weights loaded instead - // Load the weight/data in the context - // Maybe check finetuning approach at managing the lora weights. - - - - // build the map? 
TODO LORA ctx->lora_weights_map layers seem to not have buffer type but it should as the simple example does ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); // std::vector keys; // for (const auto& pair : ctx->lora_weights_map) { @@ -16398,63 +16387,9 @@ struct llama_context * llama_new_context_with_model( // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne); // } - - // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) { - // const auto * name = ggml_get_name(cur); - // // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA)); - // // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB)); + } - // } - - // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { - // const auto * weight = get_weight(ggml_get_name(cur)); - // if (weight == nullptr) { - // // this can happen with split experts models - // continue; - // } - - // if (progress_callback) { - // if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { - // return false; - // } - // } - - // size_t n_size = ggml_nbytes(cur); - - // if (use_mmap) { - // const auto & mapping = mappings.at(weight->idx); - // ggml_backend_buffer_t buf_mmap = nullptr; - // if (bufs_mmap.count(weight->idx)) { - // buf_mmap = bufs_mmap.at(weight->idx); - // } - // uint8_t * data = (uint8_t *) mapping->addr + weight->offs; - - // if (check_tensors) { - // validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { - // return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); - // })); - // } - // // TODO LORA allocation of base tensors - // GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated - // if (buf_mmap && cur->data == nullptr) { - // ggml_backend_tensor_alloc(buf_mmap, cur, data); - // if (lmlocks) { - // const auto & lmlock = lmlocks->at(weight->idx); - // lmlock->grow_to(weight->offs + n_size); - // } - - // auto & mmap_used = mmaps_used[weight->idx]; - // mmap_used.first = std::min(mmap_used.first, weight->offs); - // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); - // } else { - // ggml_backend_tensor_set(cur, data, 0, n_size); - - - - - } - - /// LORA + /// LORA load end const auto & hparams = model->hparams; auto & cparams = ctx->cparams; From 284e665a4bf209fa583f805aae9d12c9e14979df Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Tue, 2 Jul 2024 22:29:49 +0100 Subject: [PATCH 07/14] Clean up --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index fe842e4d82b58..eeca784b9e777 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3986,7 +3986,7 @@ struct llama_model_loader { return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); })); } - // TODO LORA allocation of base tensors + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, data); @@ -5427,7 +5427,7 @@ static bool llm_load_tensors( auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; model.layers.resize(n_layer); - // main players model, ml, ctx_input/output, tn (gets name?) 
+ const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: @@ -6701,7 +6701,7 @@ static bool llm_load_tensors( #endif } } -#ifdef GGML_USE_METAL // LORA Use metal on base tensors +#ifdef GGML_USE_METAL else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) { for (uint32_t idx = 0; idx < ml.files.size(); idx++) { const size_t max_size = ggml_get_max_tensor_size(ctx); From 8f0272c9d716a8938b214c6ac6d68533ab8066af Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sat, 6 Jul 2024 21:19:52 +0100 Subject: [PATCH 08/14] update branch notes --- BRANCH_SETUP.md => _BRANCH_SETUP.md | 55 +++++++++++++++++++++++++---- llama.cpp | 7 ++-- 2 files changed, 54 insertions(+), 8 deletions(-) rename BRANCH_SETUP.md => _BRANCH_SETUP.md (89%) diff --git a/BRANCH_SETUP.md b/_BRANCH_SETUP.md similarity index 89% rename from BRANCH_SETUP.md rename to _BRANCH_SETUP.md index 0b6cdac74a45e..b2d5ab6af6d59 100644 --- a/BRANCH_SETUP.md +++ b/_BRANCH_SETUP.md @@ -32,13 +32,14 @@ Run main with base model and lora adapter to hot-swap ```bash ./main -m ./models/open-llama/ggml-model-f16.gguf \ --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \ --ngl 0 \ +-ngl 99 \ +-n 128 +``` +```bash +./main -m ./models/open-llama/ggml-model-f16.gguf \ +-ngl 99 \ -n 128 ``` - -Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil` - -With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors. # Logic @@ -299,4 +300,46 @@ int main() { // } - ``` \ No newline at end of file + ``` + + + + ```bash + # Convert base model to gguf + python3 convert-hf-to-gguf.py models/open-llama/ && \ + # Quantize base model + ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \ + # Obtain Lora adapter + ./finetune --model-base models/open-llama/ggml-model-q4.gguf \ + --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \ + --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \ + --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \ + --train-data "data/hot-lora.txt" \ + --save-every 1 \ + --threads 1 \ + --adam-iter 1 \ + --batch 1 \ + --ctx 16 \ + --use-checkpointing + ``` + + + +## 1. 
Run main with adapter + +- Run main with base model and lora adapter to hot-swap + ```bash + ./main -m ./models/open-llama/ggml-model-q4.gguf \ + --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \ + -ngl 99 \ + -n 128 + ``` + +- Do not pass the flag `--hot-lora` and the adapter is ignored: + ```bash + ./main -m ./models/open-llama/ggml-model-q4.gguf \ + -ngl 99 \ + -n 128 + ``` + + make clean && make -j 8 LLAMA_DEBUG=1 \ No newline at end of file diff --git a/llama.cpp b/llama.cpp index eeca784b9e777..df098b652ba6b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9731,8 +9731,11 @@ struct llm_build_context { ggml_tensor * loraB = it->second.loraB; ggml_tensor * t_lora = ggml_mul_mat(ctx0, - ggml_mul_mat(ctx0, loraA, loraB), - cur + loraA, + ggml_mul_mat(ctx0, + ggml_transpose(ctx0, loraB), + cur + ) ); if (lctx.lora_scale != 1.0f) { From 798cde72a187e35f8a72ce68bddd8c518622ce76 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sat, 6 Jul 2024 21:40:22 +0100 Subject: [PATCH 09/14] transpose and run cont --- _BRANCH_SETUP.md | 5 ++++- llama.cpp | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/_BRANCH_SETUP.md b/_BRANCH_SETUP.md index b2d5ab6af6d59..7cdb8ae6a67e2 100644 --- a/_BRANCH_SETUP.md +++ b/_BRANCH_SETUP.md @@ -342,4 +342,7 @@ int main() { -n 128 ``` - make clean && make -j 8 LLAMA_DEBUG=1 \ No newline at end of file +build for debug: +```bash + make clean && make -j 8 LLAMA_DEBUG=1 +``` \ No newline at end of file diff --git a/llama.cpp b/llama.cpp index df098b652ba6b..1f3e127fbe408 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9731,9 +9731,9 @@ struct llm_build_context { ggml_tensor * loraB = it->second.loraB; ggml_tensor * t_lora = ggml_mul_mat(ctx0, - loraA, + loraB, ggml_mul_mat(ctx0, - ggml_transpose(ctx0, loraB), + ggml_cont(ctx0, ggml_transpose(ctx0, loraA)), cur ) ); From 931134b536d6e79c463d10b5da6f39d9d0891214 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sat, 6 Jul 2024 22:59:15 +0100 Subject: [PATCH 10/14] transpose when loading --- llama.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1f3e127fbe408..31baf6c6cccca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -328,6 +328,12 @@ static struct lora_data * load_lora(struct lora_info * info) { file.seek((0-file.tell()) & 31, SEEK_CUR); size_t offset = file.tell(); struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); + // Transpose lora matrix A + if (std::string(name_buf.data()).find("loraA") != std::string::npos) { + tensor = ggml_cont(result->ctx, + ggml_transpose(result->ctx, tensor) + ); + } ggml_set_name(tensor, name_buf.data()); size_t nbytes = ggml_nbytes(tensor); size_t nbytes_pad = ggml_nbytes_pad(tensor); @@ -9732,10 +9738,7 @@ struct llm_build_context { ggml_tensor * t_lora = ggml_mul_mat(ctx0, loraB, - ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, loraA)), - cur - ) + ggml_mul_mat(ctx0, loraA, cur) ); if (lctx.lora_scale != 1.0f) { From 41e8c733f6fd3c1d1750b50ef28d66090394d934 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sun, 7 Jul 2024 10:32:53 +0100 Subject: [PATCH 11/14] Transpose after setting data --- llama.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 31baf6c6cccca..adfbb4828ccb3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -328,12 +328,6 @@ static struct lora_data * load_lora(struct lora_info * info) { file.seek((0-file.tell()) & 31, SEEK_CUR); size_t offset = file.tell(); struct 
ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); - // Transpose lora matrix A - if (std::string(name_buf.data()).find("loraA") != std::string::npos) { - tensor = ggml_cont(result->ctx, - ggml_transpose(result->ctx, tensor) - ); - } ggml_set_name(tensor, name_buf.data()); size_t nbytes = ggml_nbytes(tensor); size_t nbytes_pad = ggml_nbytes_pad(tensor); @@ -360,6 +354,14 @@ static struct lora_data * load_lora(struct lora_info * info) { read_buf.resize(nbytes); file.read_raw(read_buf.data(), nbytes); ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes); + // Transpose lora matrix A + std::string original_name(tensor->name); + if (std::string(tensor->name).find(".loraA") != std::string::npos) { + tensor = ggml_cont(result->ctx, + ggml_transpose(result->ctx, tensor) + ); + ggml_set_name(tensor, original_name.c_str()); + } } return result; } From 6597a72c1d09544861803ebb5d1fd6265066fe0d Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sun, 7 Jul 2024 22:09:30 +0100 Subject: [PATCH 12/14] Remove files --- _BRANCH_SETUP.md | 348 ----------------------------------------- data/hot-lora.txt | 2 +- examples/main/main.cpp | 2 - llama.cpp | 21 +-- 4 files changed, 5 insertions(+), 368 deletions(-) delete mode 100644 _BRANCH_SETUP.md diff --git a/_BRANCH_SETUP.md b/_BRANCH_SETUP.md deleted file mode 100644 index 7cdb8ae6a67e2..0000000000000 --- a/_BRANCH_SETUP.md +++ /dev/null @@ -1,348 +0,0 @@ -# Setup this branch - -## Create a lora adpter bin file - -0. `mkdir models/open-llama` and download [Open-llama (all files)](https://huggingface.co/openlm-research/open_llama_3b_v2/tree/main) in the folder `./models/open-llama` - -2. `mkdir data && touch data/hot-lora.txt` and write a couple of words in it. - -3. Run: - ```bash - # Convert base model to gguf - python3 convert-hf-to-gguf.py models/open-llama/ - # Quantize base model - ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q8_0.gguf Q8_0 - # Obtain Lora adapter - ./finetune --model-base models/open-llama/ggml-model-q8_0.gguf \ - --checkpoint-in models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-LATEST.gguf \ - --checkpoint-out models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-ITERATION.gguf \ - --lora-out models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ - --train-data "data/hot-lora.txt" \ - --save-every 1 \ - --threads 1 \ - --adam-iter 1 \ - --batch 1 \ - --ctx 16 \ - --use-checkpointing - ``` - -## Run main with adapter - -Run main with base model and lora adapter to hot-swap -```bash -./main -m ./models/open-llama/ggml-model-f16.gguf \ ---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \ --ngl 99 \ --n 128 -``` -```bash -./main -m ./models/open-llama/ggml-model-f16.gguf \ --ngl 99 \ --n 128 -``` - -# Logic - - - - -# Current status - -- Only one Lora adapter can be passed. 
-- Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers) -- GPU not supported - - - - -# Tutorial - -```cpp -#include "llama.h" - -#include "unicode.h" - -#include "ggml.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" - -#ifdef GGML_USE_RPC -# include "ggml-rpc.h" -#endif - -#ifdef GGML_USE_CUDA -# include "ggml-cuda.h" -#elif defined(GGML_USE_VULKAN) -# include "ggml-vulkan.h" -#elif defined(GGML_USE_SYCL) -# include "ggml-sycl.h" -#elif defined(GGML_USE_KOMPUTE) -# include "ggml-kompute.h" -#endif - -#ifdef GGML_USE_METAL -# include "ggml-metal.h" -#endif - -// TODO: replace with ggml API call -#define QK_K 256 - -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #include - #endif - #if defined(_POSIX_MEMLOCK_RANGE) - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #ifndef PATH_MAX - #define PATH_MAX MAX_PATH - #endif - #include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-metal.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) -#endif - -#define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_EXPERTS 160 - - -int main() { - struct ggml_init_params params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - /*.no_alloc =*/ true, - }; - - // The library allows the user to define a certain function using the available tensor operations. This function - // definition is represented internally via a computation graph. Each tensor operation in the function definition - // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the - // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized - // using one of the available optimization algorithms. 
- // - // For example, here we define the function: f(x) = a*x^2 + b - - // memory allocation happens here - // Create context allogating memory - struct ggml_context * ctx = ggml_init(params); - - struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - - ggml_set_param(ctx, x); // x is an input variable - - struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - struct ggml_tensor * x2 = ggml_mul(ctx, x, x); - struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - - // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); - // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend buffer"); - } - ggml_used_mem(ctx); - - // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp - // How to check which buffer is the context allocated, - // can look at single tensors? option, check in inited in base model - - // Try this - // You can simplify all of this for testing, and if you are using CPU only, and just run with -ngl 0 - // and allocate everything in a CPU buffer by using - // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); - // or run with -ngl 99 and use a Metal buffer type instead with - // ggml_backend_metal_buffer_type() - // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend - // to allocate the tensors, it will just be slower. - - // Notice that the function definition above does not involve any actual computation. The computation is performed only - // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: - - - ggml_build_forward_expand(gf, f); - - // set the input variable and parameter values - ggml_set_f32(x, 2.0f); - ggml_set_f32(a, 3.0f); - ggml_set_f32(b, 4.0f); - - ggml_graph_compute_with_ctx(ctx, gf, 1); - - printf("f = %f\n", ggml_get_f32_1d(f, 0)); - - // The actual computation is performed in the ggml_graph_compute() function. - // - // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the - // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know - // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory - // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was - // actually needed. - // - // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic - // differentiation and optimization algorithms. - // - // The described approach allows to define the function graph once and then compute its forward or backward graphs - // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way - // the user can avoid the memory allocation overhead at runtime. - // - // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class - // citizens, but in theory the library can be extended to support FP8 and integer data types. - // - // Each tensor operation produces a new tensor. 
Initially the library was envisioned to support only the use of unary - // and binary operations. Most of the available operations fall into one of these two categories. With time, it became - // clear that the library needs to support more complex operations. The way to support these operations is not clear - // yet, but a few examples are demonstrated in the following operations: - // - // - ggml_permute() - // - ggml_conv_1d_1s() - // - ggml_conv_1d_2s() - // - // For each tensor operator, the library implements a forward and backward computation function. The forward function - // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the - // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a - // calculus class, or watch the following video: - // - // What is Automatic Differentiation? - // https://www.youtube.com/watch?v=wG_nF1awSSY - - // ## Tensor data (struct ggml_tensor) - // - // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of - // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains - // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: - - struct ggml_tensor * c = ggml_add(ctx, a, b); - - assert(c->src[0] == a); - assert(c->src[1] == b); - - // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the - // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows - // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and - // permutation. All tensor operations have to take the stride into account and not assume that the tensor is - // contiguous in memory. - - // The data of the tensor is accessed via the "data" pointer. For example: - - const int nx = 2; - const int ny = 3; - - struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); - - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y; - } - } - - // - // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. - // - - } - ``` - - - - ```bash - # Convert base model to gguf - python3 convert-hf-to-gguf.py models/open-llama/ && \ - # Quantize base model - ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \ - # Obtain Lora adapter - ./finetune --model-base models/open-llama/ggml-model-q4.gguf \ - --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \ - --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \ - --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \ - --train-data "data/hot-lora.txt" \ - --save-every 1 \ - --threads 1 \ - --adam-iter 1 \ - --batch 1 \ - --ctx 16 \ - --use-checkpointing - ``` - - - -## 1. 
Run main with adapter - -- Run main with base model and lora adapter to hot-swap - ```bash - ./main -m ./models/open-llama/ggml-model-q4.gguf \ - --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \ - -ngl 99 \ - -n 128 - ``` - -- Do not pass the flag `--hot-lora` and the adapter is ignored: - ```bash - ./main -m ./models/open-llama/ggml-model-q4.gguf \ - -ngl 99 \ - -n 128 - ``` - -build for debug: -```bash - make clean && make -j 8 LLAMA_DEBUG=1 -``` \ No newline at end of file diff --git a/data/hot-lora.txt b/data/hot-lora.txt index c43186710e906..e88891d2f5eaf 100644 --- a/data/hot-lora.txt +++ b/data/hot-lora.txt @@ -1,2 +1,2 @@ - how are you? +test data to train adapter \ No newline at end of file diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ba76a496b5999..b97b7b7937f02 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,9 +117,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } - int main(int argc, char ** argv) { - gpt_params params; g_params = ¶ms; diff --git a/llama.cpp b/llama.cpp index adfbb4828ccb3..8a5a71c77d84a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2544,7 +2544,6 @@ struct llama_context { } llama_cparams cparams; - bool lora_loaded = false; std::map lora_weights_map; // only one LoRA adapter at the moment lora_data llora_data; float lora_scale = 1.0f; @@ -16309,7 +16308,7 @@ void llama_free_model(struct llama_model * model) { } -static std::map get_lora_weights_map_cpp(struct ggml_context* ctx) { +static std::map get_lora_weights_map(struct ggml_context* ctx) { struct lora_tensor_pair* pair = build_lora_weights_map(ctx); std::map map; @@ -16370,7 +16369,7 @@ struct llama_context * llama_new_context_with_model( lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras? 
lora_params->lora.push_back(lora); - // load all loras + // load all loras (only 1 supported here) std::vector loras; for (size_t i = 0; i < lora_params->lora.size(); ++i) { struct lora_data * llora_data = load_lora(&lora_params->lora[i]); @@ -16381,22 +16380,10 @@ struct llama_context * llama_new_context_with_model( if (loras.size() == 0) { fprintf(stderr, "warning: no lora adapters will be applied.\n"); } - // Assign data + // Assign data and get mapping (index 0 as only 1 lora is supoprted now) ctx->llora_data = *loras[0]; - ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); - // std::vector keys; - // for (const auto& pair : ctx->lora_weights_map) { - // keys.push_back(pair.first); - - // ggml_tensor * tensorA = pair.second.loraA; - // ggml_tensor * tensorB = pair.second.loraB; - - // ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne); - // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne); - - // } + ctx->lora_weights_map = get_lora_weights_map((ctx->llora_data).ctx); } - /// LORA load end const auto & hparams = model->hparams; From e481eb55599ea4cfb452b9d6589b5bfbeb2574bc Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Mon, 8 Jul 2024 08:41:03 +0100 Subject: [PATCH 13/14] renames --- llama.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index 8a5a71c77d84a..ba6650ccac75e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2545,7 +2545,7 @@ struct llama_context { llama_cparams cparams; std::map lora_weights_map; // only one LoRA adapter at the moment - lora_data llora_data; + lora_data llama_lora_data; float lora_scale = 1.0f; std::vector backends; @@ -7699,21 +7699,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = lora_mul_mat(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = lora_mul_mat(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = lora_mul_mat(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9722,7 +9722,7 @@ struct llm_build_context { return gf; } - static ggml_tensor * lora_mul_mat( + static ggml_tensor * ggml_mul_mat_lora( llama_context & lctx, ggml_context * ctx0, ggml_tensor * weight, @@ -16372,17 +16372,17 @@ struct llama_context * llama_new_context_with_model( // load all loras (only 1 supported here) std::vector loras; for (size_t i = 0; i < lora_params->lora.size(); ++i) { - struct lora_data * llora_data = load_lora(&lora_params->lora[i]); - if (llora_data != NULL) { - loras.push_back(llora_data); + struct lora_data * llama_lora_data = load_lora(&lora_params->lora[i]); + if (llama_lora_data != NULL) { + loras.push_back(llama_lora_data); } } if (loras.size() == 0) { fprintf(stderr, "warning: no lora adapters will be applied.\n"); } // Assign data and get mapping (index 0 as only 1 lora is supoprted 
now) - ctx->llora_data = *loras[0]; - ctx->lora_weights_map = get_lora_weights_map((ctx->llora_data).ctx); + ctx->llama_lora_data = *loras[0]; + ctx->lora_weights_map = get_lora_weights_map((ctx->llama_lora_data).ctx); } /// LORA load end From 9d5089b5bfd15d055e16aad1d925052748102f86 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Mon, 8 Jul 2024 14:36:27 +0100 Subject: [PATCH 14/14] Add ff lora matmuls --- ggml.c | 1 + llama.cpp | 81 +++++++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/ggml.c b/ggml.c index 0fb8dafbd2ab5..a9cb2bc73b48e 100644 --- a/ggml.c +++ b/ggml.c @@ -5331,6 +5331,7 @@ struct ggml_tensor * ggml_group_norm_inplace( return ggml_group_norm_impl(ctx, a, n_groups, true); } + // ggml_mul_mat struct ggml_tensor * ggml_mul_mat( diff --git a/llama.cpp b/llama.cpp index ba6650ccac75e..986dae59cc07e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -121,6 +121,7 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, ///////// LORA + struct lora_weights { ggml_tensor* loraA; ggml_tensor* loraB; @@ -2622,6 +2623,37 @@ struct llama_context { struct llama_control_vector cvec; }; + + +static ggml_tensor * ggml_mul_mat_lora( + llama_context * lctx, + ggml_context * ctx0, + ggml_tensor * weight, + ggml_tensor * cur) { + ggml_tensor * mm = ggml_mul_mat(ctx0, weight, cur); + + auto it = lctx->lora_weights_map.find(weight->name); + if (it == lctx->lora_weights_map.end()) { + return mm; + } + + ggml_tensor * loraA = it->second.loraA; + ggml_tensor * loraB = it->second.loraB; + + ggml_tensor * t_lora = ggml_mul_mat(ctx0, + loraB, + ggml_mul_mat(ctx0, loraA, cur) + ); + + if (lctx->lora_scale != 1.0f) { + t_lora = ggml_scale(ctx0, t_lora, lctx->lora_scale); + } + + ggml_tensor * t_patch = ggml_add(ctx0, mm, t_lora); + return t_patch; + +} + static size_t llama_get_device_count(const llama_model & model) { size_t count = 1; #if defined(GGML_USE_CUDA) @@ -7022,8 +7054,9 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + int il, + struct llama_context * lctx = nullptr) { + struct ggml_tensor * tmp = up ? 
ggml_mul_mat_lora(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7035,12 +7068,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = ggml_mul_mat_lora(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = ggml_mul_mat_lora(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7088,7 +7121,7 @@ static struct ggml_tensor * llm_build_ffn( cb(cur, "ffn_gate_par", il); } - cur = ggml_mul_mat(ctx, down, cur); + cur = ggml_mul_mat_lora(lctx, ctx, down, cur); if (down_b) { cb(cur, "ffn_down", il); } @@ -7699,21 +7732,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = ggml_mul_mat_lora(&lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = ggml_mul_mat_lora(&lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = ggml_mul_mat_lora(&lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7762,7 +7795,8 @@ struct llm_build_context { model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, + &lctx); cb(cur, "ffn_out", il); } else { // MoE branch @@ -9722,35 +9756,6 @@ struct llm_build_context { return gf; } - static ggml_tensor * ggml_mul_mat_lora( - llama_context & lctx, - ggml_context * ctx0, - ggml_tensor * weight, - ggml_tensor * cur) { - ggml_tensor * mm = ggml_mul_mat(ctx0, weight, cur); - - auto it = lctx.lora_weights_map.find(weight->name); - if (it == lctx.lora_weights_map.end()) { - return mm; - } - - ggml_tensor * loraA = it->second.loraA; - ggml_tensor * loraB = it->second.loraB; - - ggml_tensor * t_lora = ggml_mul_mat(ctx0, - loraB, - ggml_mul_mat(ctx0, loraA, cur) - ); - - if (lctx.lora_scale != 1.0f) { - t_lora = ggml_scale(ctx0, t_lora, lctx.lora_scale); - } - - ggml_tensor * t_patch = ggml_add(ctx0, mm, t_lora); - return t_patch; - -} - struct ggml_cgraph * build_phi3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
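For reference, the patched projection that `ggml_mul_mat_lora` composes out of ggml ops above is simply `y = W·x + lora_scale · B·(A·x)`: the base matmul plus the low-rank correction, applied here to the attention and FFN weights. The following is a minimal plain-C++ sketch of that arithmetic only, not of the ggml graph; the row-major shape convention (`W` is d_out×d_in, `A` is r×d_in, `B` is d_out×r) and the `matvec` helper are illustrative assumptions and do not reflect ggml's ne/nb tensor layout.

```cpp
// Standalone sketch of y = W x + scale * B (A x), the quantity the
// ggml_mul_mat_lora helper builds with two nested ggml_mul_mat calls.
#include <cstdio>
#include <vector>

// Dense row-major matrix-vector product: M is rows x cols, x has cols entries.
static std::vector<float> matvec(const std::vector<float> & M, const std::vector<float> & x,
                                 int rows, int cols) {
    std::vector<float> y(rows, 0.0f);
    for (int i = 0; i < rows; ++i)
        for (int j = 0; j < cols; ++j)
            y[i] += M[i*cols + j] * x[j];
    return y;
}

int main() {
    const int d_in = 4, d_out = 4, r = 2;   // r << d_in in a real adapter
    const float scale = 1.0f;               // analogue of lctx->lora_scale

    std::vector<float> W(d_out*d_in, 0.1f); // base weight
    std::vector<float> A(r*d_in,     0.01f); // loraA (already transposed at load time)
    std::vector<float> B(d_out*r,    0.02f); // loraB
    std::vector<float> x(d_in,       1.0f);  // activation "cur"

    std::vector<float> base = matvec(W, x, d_out, d_in); // W x            (base matmul)
    std::vector<float> ax   = matvec(A, x, r,     d_in); // A x            (inner ggml_mul_mat)
    std::vector<float> bax  = matvec(B, ax, d_out, r);   // B (A x)        (outer ggml_mul_mat)

    for (int i = 0; i < d_out; ++i) base[i] += scale * bax[i]; // ggml_scale + ggml_add

    for (int i = 0; i < d_out; ++i) printf("y[%d] = %f\n", i, base[i]);
    return 0;
}
```

This is also why the earlier patches move the transpose of `loraA` around: the inner product has to see `A` laid out so that `A·x` has rank-r output, whether that is achieved with an explicit `ggml_transpose`/`ggml_cont` in the graph or by transposing the tensor once when the adapter file is loaded.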