From 67c5e14d069fba61a424f6d782de2d49bf2a8722 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Sat, 6 Jul 2024 02:12:53 +0200
Subject: [PATCH 1/8] lora: load to device buft

---
 common/common.cpp |  10 +-
 include/llama.h   |  13 +-
 src/llama.cpp     | 411 ++++++++++++++++++----------------------------
 3 files changed, 166 insertions(+), 268 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c548bcb2857a8..d3eec6aa783b3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2063,14 +2063,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
diff --git a/include/llama.h b/include/llama.h
index 865ace9944d02..077d902837c49 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -406,6 +406,9 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -510,13 +513,9 @@ extern "C" {
     // the layers modified by the adapter. Can be NULL to use the current loaded model.
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-                      const char * path_lora,
-                             float scale,
-                      const char * path_base_model,
-                            int32_t n_threads);
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_context * ctx,
+            const char * path_lora);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
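To make the API change above concrete, here is a minimal caller sketch of llama_lora_adapter_init() as declared at this point in the series; note that later patches refine the signature (a scale parameter is added in patch 4, and patch 6 splits loading into llama_lora_adapter_init(model, path) plus llama_lora_adapter_set(ctx, adapter, scale)). The sketch assumes the llama_load_model_from_file / llama_new_context_with_model helpers from the same header, and the file paths are placeholders.

#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <model.gguf> <adapter.gguf>\n", argv[0]);
        return 1;
    }
    const char * model_path   = argv[1]; // placeholder path to a GGUF base model
    const char * adapter_path = argv[2]; // placeholder path to a GGUF LoRA adapter

    llama_model * model = llama_load_model_from_file(model_path, llama_model_default_params());
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
    if (ctx == nullptr) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // new API: returns a handle instead of an error code; nullptr signals failure
    llama_lora_adapter * adapter = llama_lora_adapter_init(ctx, adapter_path);
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter\n");
        llama_free(ctx);
        llama_free_model(model);
        return 1;
    }

    // ... run inference as usual; in this first patch the adapter is owned by
    // the context and is freed in the llama_context destructor ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

Returning a handle rather than an error code is what lets the later patches in this series manage adapters explicitly (set, remove, free) instead of baking them into the model weights at load time.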
diff --git a/src/llama.cpp b/src/llama.cpp index b770ca5bc33fc..ec89b2778ea08 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,6 +2547,29 @@ struct llama_control_vector { } }; +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + // map tensor name to lora_a_b + std::map ab_map; + std::vector ctxs; + std::vector bufs; + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + } +}; + struct llama_vocab { using id = int32_t; using token = std::string; @@ -2704,6 +2727,10 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); + + for (auto adapter : lora_adapters) { + delete adapter; + } } llama_cparams cparams; @@ -2795,6 +2822,9 @@ struct llama_context { // control vectors struct llama_control_vector cvec; + + // lora adapters + std::vector lora_adapters; }; static size_t llama_get_device_count(const llama_model & model) { @@ -18243,281 +18273,149 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_apply_lora_from_file_internal( - const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads -) { +static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { + static const int n_inp_tensors = 5; // see llama_model + static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - const int64_t t_start_lora_us = ggml_time_us(); - - llama_file fin(path_lora, "rb"); - - // verify magic and version - { - uint32_t magic = fin.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); - return 1; - } - - uint32_t format_version = fin.read_u32(); - if (format_version != 1) { - LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); - return 1; - } - } - - int32_t lora_r = fin.read_u32(); - int32_t lora_alpha = fin.read_u32(); - float scaling = scale * (float)lora_alpha / (float)lora_r; + // TODO: check lora base model arch - LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - - // load base model - std::unique_ptr ml; - if (path_base_model) { - LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); - ml->init_mappings(/*prefetch*/ false); // no prefetching - } - - struct tensor_meta { - std::string name; - ggml_type type; - int32_t ne[2]; - size_t offset; + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, }; - std::map tensor_meta_map; - - // load all tensor meta - while (true) { - if (fin.tell() == fin.size) { - // eof - break; - } - - int32_t n_dims; - int32_t name_len; - int32_t ftype; - - fin.read_raw(&n_dims, sizeof(n_dims)); - fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); - - if (n_dims != 1 && n_dims != 2) { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) 
{ - fin.read_raw(&ne[i], sizeof(ne[i])); - } - - std::string name; - { - GGML_ASSERT(name_len < GGML_MAX_NAME); - char buf[GGML_MAX_NAME]; - fin.read_raw(buf, name_len); - name = std::string(buf, name_len); - } - - // check for lora suffix - std::string lora_suffix; - if (name.length() > 6) { - lora_suffix = name.substr(name.length() - 6); - } - if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; - } + struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); + if (!ctx_gguf) { + LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); + return -1; + } - // tensor type - ggml_type wtype; - switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; - default: - { - LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", - __func__, ftype); - return 1; - } + // calculate n_tensors_per_layer + int n_tensors_per_layer = 0; + { + int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + int il = -1; + sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il); + if (il == 0) n_tensors_per_layer++; } - - // data offset - size_t offset = fin.tell(); - offset = (offset + 31) & -32; - - // skip tensor data - fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - - tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); } + printf("n_tensors_per_layer %d\n", n_tensors_per_layer); - bool warned = false; - int n_tensors = 0; - - // apply - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); - return 1; + // count layer buffer types + std::map buft_layer_count; + for (int64_t i = 0; i < model.hparams.n_layer; i++) { + buft_layer_count[model.buft_layer[i].buft]++; } - ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); - - std::vector> read_buf; - for (const auto & it : model.tensors_by_name) { - const std::string & base_name = it.first; - ggml_tensor * model_t = it.second; - - if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || - tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { - continue; - } - - tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); - tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - ggml_init_params lora_init_params = { - /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), - /* .mem_buffer */ nullptr, - /* .no_alloc */ true, + // allocate contexts + std::map ctx_map; + { + auto new_ggml_ctx = [](size_t n_tensors) { + struct ggml_init_params params = { + /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + return ggml_init(params); }; - ggml_context * lora_ctx = ggml_init(lora_init_params); - if (lora_ctx == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); - ggml_backend_free(backend_cpu); - return 1; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + printf("buf %p layers %d\n", it.first, it.second); + ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer); } + //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors); + //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors); + } - // create tensors - ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], 
metaA.ne[1]); - ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); - ggml_set_name(loraA, metaA.name.c_str()); - ggml_set_name(loraB, metaB.name.c_str()); - - ggml_tensor * base_t; - if (ml) { - if (!ml->get_tensor_meta(base_name.c_str())) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; + // bundle lora_a and lora_b into pairs + std::map ab_map; + auto str_endswith = [](const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; + }; + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name(cur->name); + if (str_endswith(name, ".lora_a")) { + replace_all(name, ".lora_a", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = lora_weight(cur, nullptr); + } else { + ab_map[name].a = cur; + } + } else if (str_endswith(name, ".lora_b")) { + replace_all(name, ".lora_b", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = lora_weight(nullptr, cur); + } else { + ab_map[name].b = cur; } - base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); - } else { - base_t = ggml_dup_tensor(lora_ctx, model_t); - } - ggml_set_name(base_t, base_name.c_str()); - - // allocate in backend buffer - ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (lora_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); - return 1; } + } - // load tensor data - auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { - read_buf.resize(ggml_nbytes(tensor)); - fin.seek(tensor_meta.offset, SEEK_SET); - fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); - }; - load_tensor(metaA, loraA); - load_tensor(metaB, loraB); - - // load base model tensor data - if (ml) { - ml->load_data_for(base_t); + // add tensors + for (auto & it : ab_map) { + std::string name = it.first; + lora_weight & w = it.second; + GGML_ASSERT(w.a != nullptr); + GGML_ASSERT(w.b != nullptr); + int il = -1; + sscanf(name.c_str(), "blk.%d.", &il); + if (il >= 0) { + printf("%s %p %p\n", name.c_str(), w.a, w.b); + struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft); + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); } else { - ggml_backend_tensor_copy(model_t, base_t); - } - - if (ggml_is_quantized(base_t->type) && !warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; + // TODO: process output & token_embd tensors } + } - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; - } - - auto build_lora_graph = [&]() { - // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, 
loraA, loraB); - ggml_set_name(BA, "BA"); - - if (scaling != 1.0f) { - BA = ggml_scale(lora_ctx, BA, scaling); - ggml_set_name(BA, "BA_scaled"); - } - - ggml_tensor * r; - r = ggml_add_inplace(lora_ctx, base_t, BA); - ggml_set_name(r, "r_add"); - - if (base_t->type != model_t->type) { - // convert the result to the model type - r = ggml_cast(lora_ctx, r, model_t->type); - ggml_set_name(r, "r_cast"); + // allocate tensors / buffers and zero + { + adapter.ctxs.reserve(ctx_map.size()); + adapter.bufs.reserve(ctx_map.size()); + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); + return -1; } - - return r; - }; - - ggml_cgraph * gf = ggml_new_graph(lora_ctx); - ggml_tensor * r = build_lora_graph(); - ggml_build_forward_expand(gf, r); - - ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (graph_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; + ggml_backend_buffer_clear(buf, 0); + adapter.ctxs.push_back(ctx); + adapter.bufs.push_back(buf); } + } - ggml_backend_graph_compute(backend_cpu, gf); - - ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); - -#if 0 - // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU - //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); - - // sched compute - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_init_measure(sched, gf); - - // create the graph again, since the previous one was destroyed by the measure - ggml_graph_clear(gf); - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_graph_compute(sched, gf); - ggml_backend_sched_free(sched); -#endif - - ggml_backend_buffer_free(lora_buf); - ggml_backend_buffer_free(graph_buf); - ggml_free(lora_ctx); - - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); + // set tensor data + { + llama_file gguf_file(path_lora, "rb"); + std::vector read_buf; + auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { + size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); + size_t size = ggml_nbytes(orig); + if (read_buf.size() < size) { + read_buf.resize(size); + } + gguf_file.read_raw(read_buf.data(), size); + printf("%s: %s size=%ld\n", __func__, orig->name, size); + return ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + }; + for (auto & it : adapter.ab_map) { + auto orig = ab_map[it.first]; + auto dev = it.second; + set_tensor(orig.a, dev.a); + set_tensor(orig.b, dev.b); } } - ggml_backend_free(backend_cpu); - - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); - + // free ctx for reading gguf + ggml_free(ctx); return 0; } @@ -19298,12 +19196,19 @@ uint32_t llama_model_quantize( } } -int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) { try { - return 
llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); + struct llama_lora_adapter * adapter = new llama_lora_adapter; + int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); + if (res == 0) { + ctx->lora_adapters.push_back(adapter); + return adapter; + } else { + return nullptr; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; + return nullptr; } } From e9d7b6c05f928665cb9779629816128b8016418d Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 12:07:29 +0200 Subject: [PATCH 2/8] add patch tensor function --- src/llama.cpp | 211 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 193 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ec89b2778ea08..d97eb3bb2fc63 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2702,6 +2702,10 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; + // used by lora, to save model's original tensors + std::vector orig_tensors; + std::vector orig_layers; + ~llama_model() { for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); @@ -13491,6 +13495,10 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { return result; } +// forward declaration +static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build); +static int32_t llama_lora_restore_tensors(struct llama_context & lctx); + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -13534,6 +13542,11 @@ static struct ggml_cgraph * llama_build_graph( llm.init(); + if (!lctx.lora_adapters.empty()) { + llama_lora_restore_tensors(lctx); + llama_lora_patch_tensors(lctx, llm.ctx0); + } + switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -18304,10 +18317,12 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types - std::map buft_layer_count; + std::map buft_tensor_count; for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_layer_count[model.buft_layer[i].buft]++; + buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; } + buft_tensor_count[model.buft_input.buft] += n_inp_tensors; + buft_tensor_count[model.buft_output.buft] += n_out_tensors; // allocate contexts std::map ctx_map; @@ -18320,13 +18335,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co }; return ggml_init(params); }; - for (auto & it : buft_layer_count) { - int n_layers = it.second; - printf("buf %p layers %d\n", it.first, it.second); - ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer); + for (auto & it : buft_tensor_count) { + int n_tensors = it.second; + // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second); + ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors } - //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors); - //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors); } // bundle lora_a and lora_b into pairs @@ -18356,22 +18369,29 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // add tensors for (auto & it : ab_map) { std::string name = it.first; + const char * cname = name.c_str(); lora_weight & w = it.second; GGML_ASSERT(w.a != nullptr); GGML_ASSERT(w.b != nullptr); int il = -1; - sscanf(name.c_str(), "blk.%d.", &il); + sscanf(cname, "blk.%d.", &il); + struct 
ggml_context * dev_ctx; // device ctx if (il >= 0) { - printf("%s %p %p\n", name.c_str(), w.a, w.b); - struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft); - struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); - struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); - ggml_set_name(tensor_a, w.a->name); - ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); + dev_ctx = ctx_map.at(model.buft_layer[il].buft); + } else if (strstr(cname, "tok") == 0) { + dev_ctx = ctx_map.at(model.buft_input.buft); + } else if (strstr(cname, "output") == 0) { + dev_ctx = ctx_map.at(model.buft_output.buft); } else { - // TODO: process output & token_embd tensors + LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); + continue; } + // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); } // allocate tensors / buffers and zero @@ -18402,8 +18422,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co if (read_buf.size() < size) { read_buf.resize(size); } + gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - printf("%s: %s size=%ld\n", __func__, orig->name, size); + // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size); return ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { @@ -18414,11 +18435,165 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } } + LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); + // free ctx for reading gguf ggml_free(ctx); return 0; } +static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { + // TODO @ngxson : not ideal, but "const" is discarded to make it work + struct llama_model & model = const_cast(lctx.model); + if (!model.orig_tensors.empty()) { + size_t i = 0; + model.tok_embd = model.orig_tensors[i++]; + model.type_embd = model.orig_tensors[i++]; + model.pos_embd = model.orig_tensors[i++]; + model.tok_norm = model.orig_tensors[i++]; + model.tok_norm_b = model.orig_tensors[i++]; + model.output_norm = model.orig_tensors[i++]; + model.output_norm_b = model.orig_tensors[i++]; + model.output = model.orig_tensors[i++]; + model.output_b = model.orig_tensors[i++]; + model.output_norm_enc = model.orig_tensors[i++]; + for (size_t il = 0; il < model.orig_layers.size(); il++) { + model.layers[il] = model.orig_layers[il]; // copy + } + } +} + +static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) { + GGML_ASSERT(!lctx.lora_adapters.empty()); + // TODO @ngxson : not ideal, but "const" is discarded to make it work + struct llama_model & model = const_cast(lctx.model); + + // save all original tensors + if (model.orig_tensors.empty()) { + model.orig_tensors.push_back(model.tok_embd); + model.orig_tensors.push_back(model.type_embd); + model.orig_tensors.push_back(model.pos_embd); + model.orig_tensors.push_back(model.tok_norm); + model.orig_tensors.push_back(model.tok_norm_b); + model.orig_tensors.push_back(model.output_norm); + model.orig_tensors.push_back(model.output_norm_b); + model.orig_tensors.push_back(model.output); + model.orig_tensors.push_back(model.output_b); + 
model.orig_tensors.push_back(model.output_norm_enc); + model.orig_layers.reserve(model.layers.size()); + for (llama_layer layer : model.layers) { + model.orig_layers.push_back(layer); // copy + } + } + + // patch tensors + auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) { + if (*tensor == nullptr) { + return; + } + std::string name = ggml_get_name(*tensor); + if (adapter->ab_map.find(name) != adapter->ab_map.end()) { + auto lora_w = adapter->ab_map[name]; + struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b); + cur = ggml_add(ctx_build, cur, *tensor); + // TODO: scale + ggml_format_name(cur, "%s.merged", name.c_str()); + // LLAMA_LOG_INFO("LORA %s\n", cur->name); + tensor = &cur; + } + }; + for (auto adapter : lctx.lora_adapters) { + patch_tensor(adapter, &model.tok_embd); + patch_tensor(adapter, &model.type_embd); + patch_tensor(adapter, &model.pos_embd); + patch_tensor(adapter, &model.tok_norm); + patch_tensor(adapter, &model.tok_norm_b); + patch_tensor(adapter, &model.output_norm); + patch_tensor(adapter, &model.output_norm_b); + patch_tensor(adapter, &model.output); + patch_tensor(adapter, &model.output_b); + patch_tensor(adapter, &model.output_norm_enc); + for (llama_layer & layer : model.layers) { + patch_tensor(adapter, &layer.attn_norm); + patch_tensor(adapter, &layer.attn_norm_b); + patch_tensor(adapter, &layer.attn_norm_2); + patch_tensor(adapter, &layer.attn_norm_2_b); + patch_tensor(adapter, &layer.attn_q_norm); + patch_tensor(adapter, &layer.attn_q_norm_b); + patch_tensor(adapter, &layer.attn_k_norm); + patch_tensor(adapter, &layer.attn_k_norm_b); + patch_tensor(adapter, &layer.attn_out_norm); + patch_tensor(adapter, &layer.attn_out_norm_b); + patch_tensor(adapter, &layer.attn_q_a_norm); + patch_tensor(adapter, &layer.attn_kv_a_norm); + patch_tensor(adapter, &layer.attn_sub_norm); + patch_tensor(adapter, &layer.attn_post_norm); + patch_tensor(adapter, &layer.ffn_sub_norm); + patch_tensor(adapter, &layer.attn_norm_cross); + patch_tensor(adapter, &layer.attn_norm_enc); + + patch_tensor(adapter, &layer.wq); + patch_tensor(adapter, &layer.wk); + patch_tensor(adapter, &layer.wv); + patch_tensor(adapter, &layer.wo); + patch_tensor(adapter, &layer.wqkv); + patch_tensor(adapter, &layer.wq_a); + patch_tensor(adapter, &layer.wq_b); + patch_tensor(adapter, &layer.wkv_a_mqa); + patch_tensor(adapter, &layer.wkv_b); + patch_tensor(adapter, &layer.wq_cross); + patch_tensor(adapter, &layer.wk_cross); + patch_tensor(adapter, &layer.wv_cross); + patch_tensor(adapter, &layer.wo_cross); + patch_tensor(adapter, &layer.wq_enc); + patch_tensor(adapter, &layer.wk_enc); + patch_tensor(adapter, &layer.wv_enc); + patch_tensor(adapter, &layer.wo_enc); + + patch_tensor(adapter, &layer.bq); + patch_tensor(adapter, &layer.bk); + patch_tensor(adapter, &layer.bv); + patch_tensor(adapter, &layer.bo); + patch_tensor(adapter, &layer.bqkv); + + patch_tensor(adapter, &layer.attn_rel_b); + patch_tensor(adapter, &layer.attn_rel_b_enc); + patch_tensor(adapter, &layer.attn_rel_b_cross); + + patch_tensor(adapter, &layer.ffn_norm); + patch_tensor(adapter, &layer.ffn_norm_b); + patch_tensor(adapter, &layer.ffn_post_norm); + patch_tensor(adapter, &layer.layer_out_norm); + patch_tensor(adapter, &layer.layer_out_norm_b); + patch_tensor(adapter, &layer.ffn_norm_exps); + patch_tensor(adapter, &layer.ffn_norm_enc); + + patch_tensor(adapter, &layer.ffn_gate); + patch_tensor(adapter, &layer.ffn_down); + patch_tensor(adapter, &layer.ffn_up); + patch_tensor(adapter, 
&layer.ffn_gate_enc); + patch_tensor(adapter, &layer.ffn_down_enc); + patch_tensor(adapter, &layer.ffn_up_enc); + + patch_tensor(adapter, &layer.ffn_gate_inp); + patch_tensor(adapter, &layer.ffn_gate_exps); + patch_tensor(adapter, &layer.ffn_down_exps); + patch_tensor(adapter, &layer.ffn_up_exps ); + + patch_tensor(adapter, &layer.ffn_gate_inp_shexp); + patch_tensor(adapter, &layer.ffn_gate_shexp); + patch_tensor(adapter, &layer.ffn_down_shexp); + patch_tensor(adapter, &layer.ffn_up_shexp); + + patch_tensor(adapter, &layer.ffn_gate_b); + patch_tensor(adapter, &layer.ffn_down_b); + patch_tensor(adapter, &layer.ffn_up_b); + patch_tensor(adapter, &layer.ffn_act); + } + } + return 0; +} + // // interface implementation // From 4e28ad40a099c7f618abf8ae113c4e56ee7705e8 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 13:29:37 +0200 Subject: [PATCH 3/8] correct tensor patch --- ggml/src/ggml.c | 4 ++-- src/llama.cpp | 33 ++++++++++++--------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bc91ac3a726ab..2093be2a98013 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; @@ -19401,7 +19401,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); - if (ggml_nelements(node) < 5) { + if (ggml_nelements(node) < 5 && node->data != NULL) { fprintf(fp, " | ("); for (int j = 0; j < ggml_nelements(node); j++) { if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { diff --git a/src/llama.cpp b/src/llama.cpp index d97eb3bb2fc63..1c7f6650a9c47 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18314,7 +18314,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co if (il == 0) n_tensors_per_layer++; } } - printf("n_tensors_per_layer %d\n", n_tensors_per_layer); + // printf("n_tensors_per_layer %d\n", n_tensors_per_layer); // count layer buffer types std::map buft_tensor_count; @@ -18363,6 +18363,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } else { ab_map[name].b = cur; } + } else { + LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); } } @@ -18400,14 +18402,14 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co adapter.bufs.reserve(ctx_map.size()); for (auto it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + ggml_context * ctx_dev = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); if (!buf) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); return -1; } ggml_backend_buffer_clear(buf, 0); - adapter.ctxs.push_back(ctx); + adapter.ctxs.push_back(ctx_dev); adapter.bufs.push_back(buf); } } @@ -18424,8 +18426,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co } gguf_file.seek(offs, SEEK_SET); gguf_file.read_raw(read_buf.data(), size); - // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size); - return 
ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size); + ggml_backend_tensor_set(dev, read_buf.data(), 0, size); }; for (auto & it : adapter.ab_map) { auto orig = ab_map[it.first]; @@ -18461,6 +18463,7 @@ static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { model.layers[il] = model.orig_layers[il]; // copy } } + return 0; } static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) { @@ -18498,8 +18501,8 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml cur = ggml_add(ctx_build, cur, *tensor); // TODO: scale ggml_format_name(cur, "%s.merged", name.c_str()); - // LLAMA_LOG_INFO("LORA %s\n", cur->name); - tensor = &cur; + // LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name); + *tensor = cur; } }; for (auto adapter : lctx.lora_adapters) { @@ -18541,14 +18544,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.wq_b); patch_tensor(adapter, &layer.wkv_a_mqa); patch_tensor(adapter, &layer.wkv_b); - patch_tensor(adapter, &layer.wq_cross); - patch_tensor(adapter, &layer.wk_cross); - patch_tensor(adapter, &layer.wv_cross); - patch_tensor(adapter, &layer.wo_cross); - patch_tensor(adapter, &layer.wq_enc); - patch_tensor(adapter, &layer.wk_enc); - patch_tensor(adapter, &layer.wv_enc); - patch_tensor(adapter, &layer.wo_enc); patch_tensor(adapter, &layer.bq); patch_tensor(adapter, &layer.bk); @@ -18556,10 +18551,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.bo); patch_tensor(adapter, &layer.bqkv); - patch_tensor(adapter, &layer.attn_rel_b); - patch_tensor(adapter, &layer.attn_rel_b_enc); - patch_tensor(adapter, &layer.attn_rel_b_cross); - patch_tensor(adapter, &layer.ffn_norm); patch_tensor(adapter, &layer.ffn_norm_b); patch_tensor(adapter, &layer.ffn_post_norm); @@ -18578,7 +18569,7 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml patch_tensor(adapter, &layer.ffn_gate_inp); patch_tensor(adapter, &layer.ffn_gate_exps); patch_tensor(adapter, &layer.ffn_down_exps); - patch_tensor(adapter, &layer.ffn_up_exps ); + patch_tensor(adapter, &layer.ffn_up_exps); patch_tensor(adapter, &layer.ffn_gate_inp_shexp); patch_tensor(adapter, &layer.ffn_gate_shexp); From 1b4ffbac4720cd9bee0bc0422df927a1ff1dc22f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 14:24:56 +0200 Subject: [PATCH 4/8] llama_lora_adapter_apply --- common/common.cpp | 3 +- ggml/src/ggml.c | 2 +- include/llama.h | 6 +- src/llama.cpp | 253 +++++++++++++++++++++------------------------- 4 files changed, 122 insertions(+), 142 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d3eec6aa783b3..d5dd4d38d3cf0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,13 +2063,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str()); + auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + llama_lora_adapter_apply(lctx, adapter); } if 
(params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2093be2a98013..2e09b7087e667 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = TB;\n"); + fprintf(fp, " rankdir = LR;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 077d902837c49..50ea0d84773bf 100644 --- a/include/llama.h +++ b/include/llama.h @@ -515,7 +515,11 @@ extern "C" { // will be applied on top of the previous one LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( struct llama_context * ctx, - const char * path_lora); + const char * path_lora, + float scale); + LLAMA_API int32_t llama_lora_adapter_apply( + struct llama_context * ctx, + struct llama_lora_adapter * adapter); // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. diff --git a/src/llama.cpp b/src/llama.cpp index 1c7f6650a9c47..de3d77485c0c2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2559,6 +2559,7 @@ struct llama_lora_adapter { std::map ab_map; std::vector ctxs; std::vector bufs; + float scale = 1.0f; ~llama_lora_adapter() { for (struct ggml_context * ctx : ctxs) { @@ -13495,10 +13496,6 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { return result; } -// forward declaration -static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build); -static int32_t llama_lora_restore_tensors(struct llama_context & lctx); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -13542,11 +13539,6 @@ static struct ggml_cgraph * llama_build_graph( llm.init(); - if (!lctx.lora_adapters.empty()) { - llama_lora_restore_tensors(lctx); - llama_lora_patch_tensors(lctx, llm.ctx0); - } - switch (model.arch) { case LLM_ARCH_LLAMA: { @@ -18444,144 +18436,126 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; } -static int32_t llama_lora_restore_tensors(struct llama_context & lctx) { - // TODO @ngxson : not ideal, but "const" is discarded to make it work - struct llama_model & model = const_cast(lctx.model); - if (!model.orig_tensors.empty()) { - size_t i = 0; - model.tok_embd = model.orig_tensors[i++]; - model.type_embd = model.orig_tensors[i++]; - model.pos_embd = model.orig_tensors[i++]; - model.tok_norm = model.orig_tensors[i++]; - model.tok_norm_b = model.orig_tensors[i++]; - model.output_norm = model.orig_tensors[i++]; - model.output_norm_b = model.orig_tensors[i++]; - model.output = model.orig_tensors[i++]; - model.output_b = model.orig_tensors[i++]; - model.output_norm_enc = model.orig_tensors[i++]; - for (size_t il = 0; il < model.orig_layers.size(); il++) { - model.layers[il] = model.orig_layers[il]; // copy - } - } - return 0; -} +int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { + GGML_ASSERT(!lctx->lora_adapters.empty()); + const struct llama_model & model = lctx->model; + struct ggml_init_params ctx0_params = { + /*.mem_size =*/ lctx->buf_compute_meta.size(), + /*.mem_buffer =*/ lctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(ctx0_params); -static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) 
{ - GGML_ASSERT(!lctx.lora_adapters.empty()); - // TODO @ngxson : not ideal, but "const" is discarded to make it work - struct llama_model & model = const_cast(lctx.model); - - // save all original tensors - if (model.orig_tensors.empty()) { - model.orig_tensors.push_back(model.tok_embd); - model.orig_tensors.push_back(model.type_embd); - model.orig_tensors.push_back(model.pos_embd); - model.orig_tensors.push_back(model.tok_norm); - model.orig_tensors.push_back(model.tok_norm_b); - model.orig_tensors.push_back(model.output_norm); - model.orig_tensors.push_back(model.output_norm_b); - model.orig_tensors.push_back(model.output); - model.orig_tensors.push_back(model.output_b); - model.orig_tensors.push_back(model.output_norm_enc); - model.orig_layers.reserve(model.layers.size()); - for (llama_layer layer : model.layers) { - model.orig_layers.push_back(layer); // copy - } - } - - // patch tensors - auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) { - if (*tensor == nullptr) { + // apply lora for model tensors + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + std::vector> output_nodes; + auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { + if (model_tensor == nullptr) { return; } - std::string name = ggml_get_name(*tensor); + std::string name = ggml_get_name(model_tensor); if (adapter->ab_map.find(name) != adapter->ab_map.end()) { auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b); - cur = ggml_add(ctx_build, cur, *tensor); - // TODO: scale + struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); + cur = ggml_scale_inplace(ctx0, cur, adapter->scale); + cur = ggml_add(ctx0, cur, model_tensor); ggml_format_name(cur, "%s.merged", name.c_str()); - // LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name); - *tensor = cur; + ggml_build_forward_expand(gf, cur); + output_nodes.push_back({model_tensor, cur}); } }; - for (auto adapter : lctx.lora_adapters) { - patch_tensor(adapter, &model.tok_embd); - patch_tensor(adapter, &model.type_embd); - patch_tensor(adapter, &model.pos_embd); - patch_tensor(adapter, &model.tok_norm); - patch_tensor(adapter, &model.tok_norm_b); - patch_tensor(adapter, &model.output_norm); - patch_tensor(adapter, &model.output_norm_b); - patch_tensor(adapter, &model.output); - patch_tensor(adapter, &model.output_b); - patch_tensor(adapter, &model.output_norm_enc); - for (llama_layer & layer : model.layers) { - patch_tensor(adapter, &layer.attn_norm); - patch_tensor(adapter, &layer.attn_norm_b); - patch_tensor(adapter, &layer.attn_norm_2); - patch_tensor(adapter, &layer.attn_norm_2_b); - patch_tensor(adapter, &layer.attn_q_norm); - patch_tensor(adapter, &layer.attn_q_norm_b); - patch_tensor(adapter, &layer.attn_k_norm); - patch_tensor(adapter, &layer.attn_k_norm_b); - patch_tensor(adapter, &layer.attn_out_norm); - patch_tensor(adapter, &layer.attn_out_norm_b); - patch_tensor(adapter, &layer.attn_q_a_norm); - patch_tensor(adapter, &layer.attn_kv_a_norm); - patch_tensor(adapter, &layer.attn_sub_norm); - patch_tensor(adapter, &layer.attn_post_norm); - patch_tensor(adapter, &layer.ffn_sub_norm); - patch_tensor(adapter, &layer.attn_norm_cross); - patch_tensor(adapter, &layer.attn_norm_enc); - - patch_tensor(adapter, &layer.wq); - patch_tensor(adapter, &layer.wk); - patch_tensor(adapter, &layer.wv); - patch_tensor(adapter, &layer.wo); - patch_tensor(adapter, &layer.wqkv); - patch_tensor(adapter, &layer.wq_a); 
- patch_tensor(adapter, &layer.wq_b); - patch_tensor(adapter, &layer.wkv_a_mqa); - patch_tensor(adapter, &layer.wkv_b); - - patch_tensor(adapter, &layer.bq); - patch_tensor(adapter, &layer.bk); - patch_tensor(adapter, &layer.bv); - patch_tensor(adapter, &layer.bo); - patch_tensor(adapter, &layer.bqkv); - - patch_tensor(adapter, &layer.ffn_norm); - patch_tensor(adapter, &layer.ffn_norm_b); - patch_tensor(adapter, &layer.ffn_post_norm); - patch_tensor(adapter, &layer.layer_out_norm); - patch_tensor(adapter, &layer.layer_out_norm_b); - patch_tensor(adapter, &layer.ffn_norm_exps); - patch_tensor(adapter, &layer.ffn_norm_enc); - - patch_tensor(adapter, &layer.ffn_gate); - patch_tensor(adapter, &layer.ffn_down); - patch_tensor(adapter, &layer.ffn_up); - patch_tensor(adapter, &layer.ffn_gate_enc); - patch_tensor(adapter, &layer.ffn_down_enc); - patch_tensor(adapter, &layer.ffn_up_enc); - - patch_tensor(adapter, &layer.ffn_gate_inp); - patch_tensor(adapter, &layer.ffn_gate_exps); - patch_tensor(adapter, &layer.ffn_down_exps); - patch_tensor(adapter, &layer.ffn_up_exps); - - patch_tensor(adapter, &layer.ffn_gate_inp_shexp); - patch_tensor(adapter, &layer.ffn_gate_shexp); - patch_tensor(adapter, &layer.ffn_down_shexp); - patch_tensor(adapter, &layer.ffn_up_shexp); - - patch_tensor(adapter, &layer.ffn_gate_b); - patch_tensor(adapter, &layer.ffn_down_b); - patch_tensor(adapter, &layer.ffn_up_b); - patch_tensor(adapter, &layer.ffn_act); - } + apply_lora(adapter, model.tok_embd); + apply_lora(adapter, model.type_embd); + apply_lora(adapter, model.pos_embd); + apply_lora(adapter, model.tok_norm); + apply_lora(adapter, model.tok_norm_b); + apply_lora(adapter, model.output_norm); + apply_lora(adapter, model.output_norm_b); + apply_lora(adapter, model.output); + apply_lora(adapter, model.output_b); + apply_lora(adapter, model.output_norm_enc); + for (const llama_layer & layer : model.layers) { + apply_lora(adapter, layer.attn_norm); + apply_lora(adapter, layer.attn_norm_b); + apply_lora(adapter, layer.attn_norm_2); + apply_lora(adapter, layer.attn_norm_2_b); + apply_lora(adapter, layer.attn_q_norm); + apply_lora(adapter, layer.attn_q_norm_b); + apply_lora(adapter, layer.attn_k_norm); + apply_lora(adapter, layer.attn_k_norm_b); + apply_lora(adapter, layer.attn_out_norm); + apply_lora(adapter, layer.attn_out_norm_b); + apply_lora(adapter, layer.attn_q_a_norm); + apply_lora(adapter, layer.attn_kv_a_norm); + apply_lora(adapter, layer.attn_sub_norm); + apply_lora(adapter, layer.attn_post_norm); + apply_lora(adapter, layer.ffn_sub_norm); + apply_lora(adapter, layer.attn_norm_cross); + apply_lora(adapter, layer.attn_norm_enc); + + apply_lora(adapter, layer.wq); + apply_lora(adapter, layer.wk); + apply_lora(adapter, layer.wv); + apply_lora(adapter, layer.wo); + apply_lora(adapter, layer.wqkv); + apply_lora(adapter, layer.wq_a); + apply_lora(adapter, layer.wq_b); + apply_lora(adapter, layer.wkv_a_mqa); + apply_lora(adapter, layer.wkv_b); + + apply_lora(adapter, layer.bq); + apply_lora(adapter, layer.bk); + apply_lora(adapter, layer.bv); + apply_lora(adapter, layer.bo); + apply_lora(adapter, layer.bqkv); + + apply_lora(adapter, layer.ffn_norm); + apply_lora(adapter, layer.ffn_norm_b); + apply_lora(adapter, layer.ffn_post_norm); + apply_lora(adapter, layer.layer_out_norm); + apply_lora(adapter, layer.layer_out_norm_b); + apply_lora(adapter, layer.ffn_norm_exps); + apply_lora(adapter, layer.ffn_norm_enc); + + apply_lora(adapter, layer.ffn_gate); + apply_lora(adapter, layer.ffn_down); + apply_lora(adapter, 
layer.ffn_up); + apply_lora(adapter, layer.ffn_gate_enc); + apply_lora(adapter, layer.ffn_down_enc); + apply_lora(adapter, layer.ffn_up_enc); + + apply_lora(adapter, layer.ffn_gate_inp); + apply_lora(adapter, layer.ffn_gate_exps); + apply_lora(adapter, layer.ffn_down_exps); + apply_lora(adapter, layer.ffn_up_exps); + + apply_lora(adapter, layer.ffn_gate_inp_shexp); + apply_lora(adapter, layer.ffn_gate_shexp); + apply_lora(adapter, layer.ffn_down_shexp); + apply_lora(adapter, layer.ffn_up_shexp); + + apply_lora(adapter, layer.ffn_gate_b); + apply_lora(adapter, layer.ffn_down_b); + apply_lora(adapter, layer.ffn_up_b); + apply_lora(adapter, layer.ffn_act); + } + + // merge lora to model weight + ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); + if (res == GGML_STATUS_SUCCESS) { + for (auto & out : output_nodes) { + struct ggml_tensor * model_tensor = out.first; + struct ggml_tensor * merged_tensor = out.second; + ggml_backend_tensor_copy(merged_tensor, model_tensor); + ggml_set_name(model_tensor, merged_tensor->name); + } + LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, output_nodes.size()); + } else { + LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); + return res; } + + ggml_free(ctx0); return 0; } @@ -19362,9 +19336,10 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { try { struct llama_lora_adapter * adapter = new llama_lora_adapter; + adapter->scale = scale; int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); if (res == 0) { ctx->lora_adapters.push_back(adapter); From b88ce0f8927427929e25f45a419623a55ca043f4 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 6 Jul 2024 15:06:32 +0200 Subject: [PATCH 5/8] correct ggml_backend_tensor_copy --- src/llama.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index de3d77485c0c2..5f02106d366a2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18446,9 +18446,10 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ }; struct ggml_context * ctx0 = ggml_init(ctx0_params); + // map "merged.%s" name to model tensor + std::map output_map; // apply lora for model tensors struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - std::vector> output_nodes; auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) { if (model_tensor == nullptr) { return; @@ -18459,9 +18460,9 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); cur = ggml_scale_inplace(ctx0, cur, adapter->scale); cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "%s.merged", name.c_str()); + ggml_format_name(cur, "merged.%s", name.c_str()); ggml_build_forward_expand(gf, cur); - output_nodes.push_back({model_tensor, cur}); + output_map[std::string(cur->name)] = model_tensor; } }; apply_lora(adapter, model.tok_embd); @@ -18543,13 +18544,19 @@ int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_ // merge lora to model weight ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); if (res == GGML_STATUS_SUCCESS) { - for (auto & out : output_nodes) { 
- struct ggml_tensor * model_tensor = out.first; - struct ggml_tensor * merged_tensor = out.second; - ggml_backend_tensor_copy(merged_tensor, model_tensor); - ggml_set_name(model_tensor, merged_tensor->name); - } - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, output_nodes.size()); + // graph maybe realloc, we need to find correct gf->nodes based on name + size_t n_merged = 0; + for (int i = 0; i < gf->n_nodes; ++i) { + auto node = gf->nodes[i]; + std::string name(node->name); + if (output_map.find(name) != output_map.end()) { + struct ggml_tensor * model_tensor = output_map[name]; + ggml_backend_tensor_copy(node, model_tensor); + n_merged++; + } + } + GGML_ASSERT(n_merged == output_map.size()); + LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); } else { LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); return res; From f6d090d7de2544be6a508d53630e791d9ce0751f Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:01:05 +0200 Subject: [PATCH 6/8] add llm_build_mm --- common/common.cpp | 4 +- ggml/src/ggml.c | 2 +- include/llama.h | 24 ++- src/llama.cpp | 467 ++++++++++++++++++++-------------------------- 4 files changed, 220 insertions(+), 277 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d5dd4d38d3cf0..ec5709f83fd5e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2063,14 +2063,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str(), lora_scale); + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } - llama_lora_adapter_apply(lctx, adapter); + llama_lora_adapter_set(lctx, adapter, lora_scale); } if (params.ignore_eos) { diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2e09b7087e667..2093be2a98013 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; diff --git a/include/llama.h b/include/llama.h index 50ea0d84773bf..37140b7714788 100644 --- a/include/llama.h +++ b/include/llama.h @@ -508,19 +508,29 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a LoRA adapter to a loaded model - // path_base_model is the path to a higher quality model to use as a base for - // the layers modified by the adapter. Can be NULL to use the current loaded model. 
- // The model needs to be reloaded before applying a new adapter, otherwise the adapter - // will be applied on top of the previous one + // Load a LoRA adapter from file + // The loaded adapter will be associated to the given model, and will be free when the model is deleted LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( + struct llama_model * model, + const char * path_lora); + + // Add a loaded LoRA adapter to given context + // This will not modify model's weight + LLAMA_API int32_t llama_lora_adapter_set( struct llama_context * ctx, - const char * path_lora, + struct llama_lora_adapter * adapter, float scale); - LLAMA_API int32_t llama_lora_adapter_apply( + + // Remove a LoRA adapter from given context + // Return -1 if the adapter is not present in the context + LLAMA_API int32_t llama_lora_adapter_remove( struct llama_context * ctx, struct llama_lora_adapter * adapter); + // Manually free a LoRA adapter + // Note: loaded adapters will be free when the associated model is deleted + LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); + // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. // n_embd should be the size of a single layer's control, and data should point diff --git a/src/llama.cpp b/src/llama.cpp index 5f02106d366a2..ee18ca847fde5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,30 +2547,6 @@ struct llama_control_vector { } }; -struct lora_weight { - struct ggml_tensor * a = nullptr; - struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} -}; - -struct llama_lora_adapter { - // map tensor name to lora_a_b - std::map ab_map; - std::vector ctxs; - std::vector bufs; - float scale = 1.0f; - - ~llama_lora_adapter() { - for (struct ggml_context * ctx : ctxs) { - ggml_free(ctx); - } - for (ggml_backend_buffer_t buf : bufs) { - ggml_backend_buffer_free(buf); - } - } -}; - struct llama_vocab { using id = int32_t; using token = std::string; @@ -2703,9 +2679,8 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; - // used by lora, to save model's original tensors - std::vector orig_tensors; - std::vector orig_layers; + // keep track of loaded lora adapters + std::set lora_adapters; ~llama_model() { for (struct ggml_context * ctx : ctxs) { @@ -2719,6 +2694,9 @@ struct llama_model { #endif ggml_backend_buffer_free(buf); } + while (!lora_adapters.empty()) { + llama_lora_adapter_free(*lora_adapters.begin()); + } } }; @@ -2732,10 +2710,6 @@ struct llama_context { } ggml_backend_buffer_free(buf_output); - - for (auto adapter : lora_adapters) { - delete adapter; - } } llama_cparams cparams; @@ -2828,8 +2802,50 @@ struct llama_context { // control vectors struct llama_control_vector cvec; - // lora adapters - std::vector lora_adapters; + // lora adapters and scales + std::map lora_adapters; +}; + +struct lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + lora_weight() {} + lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + struct llama_model * base_model; + // map tensor name to lora_a_b + std::map ab_map; + std::vector ctxs; + std::vector bufs; + + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { + base_model->lora_adapters.insert(this); + } + + bool has_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.find(name) != ab_map.end(); + } + + 
lora_weight & get_weight(struct ggml_tensor * w) { + std::string name(w->name); + return ab_map.at(name); + } + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + auto pos = base_model->lora_adapters.find(this); + if (pos != base_model->lora_adapters.end()) { + base_model->lora_adapters.erase(pos); + } + } }; static size_t llama_get_device_count(const llama_model & model) { @@ -7773,6 +7789,32 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); } +// do mat_mul, while optionally apply lora +static struct ggml_tensor * llm_build_mm( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_adapter * adapter = it.first; + float scale = it.second; + if (!adapter->has_weight(w)) { + continue; + } + struct lora_weight & lora = adapter->get_weight(w); + // TODO: check if lora_a need transpose + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a)); + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lora.b, + ggml_mul_mat(ctx0, a, cur) + ); + ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -7806,6 +7848,7 @@ static struct ggml_tensor * llm_build_norm( } static struct ggml_tensor * llm_build_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * up, @@ -7822,7 +7865,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? 
llm_build_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7839,12 +7882,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = llm_build_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = llm_build_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7899,7 +7942,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = ggml_mul_mat(ctx, down, cur); + cur = llm_build_mm(lctx, ctx, down, cur); } if (down_b) { @@ -7919,6 +7962,7 @@ static struct ggml_tensor * llm_build_ffn( } static struct ggml_tensor * llm_build_moe_ffn( + struct llama_context & lctx, struct ggml_context * ctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, @@ -7936,7 +7980,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8019,6 +8063,7 @@ static struct ggml_tensor * llm_build_moe_ffn( } static struct ggml_tensor * llm_build_kqv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8076,7 +8121,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8119,7 +8164,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8132,7 +8177,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = ggml_mul_mat(ctx, wo, cur); + cur = llm_build_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8147,6 +8192,7 @@ static struct ggml_tensor * llm_build_kqv( } static struct ggml_tensor * llm_build_kv( + struct llama_context & lctx, struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, @@ -8176,7 +8222,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(lctx, ctx, model, hparams, cparams, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8638,21 +8684,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); 
} - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8673,7 +8719,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8696,7 +8742,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -8710,7 +8756,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8740,7 +8786,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -8808,7 +8854,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8830,7 +8876,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8913,7 +8959,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8935,7 +8981,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9034,7 +9080,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9051,7 +9097,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(lctx, ctx0, attn_norm, // !! 
use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9158,7 +9204,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9190,7 +9236,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9308,7 +9354,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9331,7 +9377,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9418,7 +9464,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9442,7 +9488,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9512,7 +9558,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9534,7 +9580,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9719,21 +9765,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, 
@@ -9807,7 +9853,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9831,7 +9877,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9939,13 +9985,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9969,7 +10015,7 @@ struct llm_build_context { model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10090,7 +10136,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10118,7 +10164,7 @@ struct llm_build_context { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10209,7 +10255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10231,7 +10277,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10323,7 +10369,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10344,7 +10390,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = 
llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10438,7 +10484,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10461,7 +10507,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10481,7 +10527,7 @@ struct llm_build_context { ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, + ggml_tensor * cur_ffn = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -10595,7 +10641,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10610,7 +10656,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, attn_norm_output, + ffn_output = llm_build_ffn(lctx, ctx0, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10715,7 +10761,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10830,7 +10876,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10848,7 +10894,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10932,7 +10978,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10956,7 +11002,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11043,7 +11089,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = 
llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11067,7 +11113,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11163,7 +11209,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11184,7 +11230,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11281,7 +11327,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11302,7 +11348,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11412,7 +11458,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11439,7 +11485,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11534,7 +11580,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11556,7 +11602,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11647,7 +11693,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11674,7 +11720,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = 
llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11784,7 +11830,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11806,7 +11852,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12077,7 +12123,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12094,7 +12140,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, ffn_inp, + cur = llm_build_ffn(lctx, ctx0, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12209,7 +12255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12231,7 +12277,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12336,7 +12382,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12358,7 +12404,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12445,7 +12491,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12470,7 +12516,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12501,7 +12547,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12588,7 +12634,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12610,7 +12656,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12627,7 +12673,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12810,7 +12856,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -12832,7 +12878,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12847,7 +12893,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(lctx, ctx0, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12860,7 +12906,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -12965,7 +13011,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -12998,7 +13044,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -13132,7 +13178,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -13310,7 +13356,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, 
model.layers[il].ffn_down, NULL, NULL, @@ -13392,7 +13438,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13416,7 +13462,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -18278,7 +18324,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) { +static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18310,11 +18356,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co // count layer buffer types std::map buft_tensor_count; - for (int64_t i = 0; i < model.hparams.n_layer; i++) { - buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer; + for (int64_t i = 0; i < model->hparams.n_layer; i++) { + buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer; } - buft_tensor_count[model.buft_input.buft] += n_inp_tensors; - buft_tensor_count[model.buft_output.buft] += n_out_tensors; + buft_tensor_count[model->buft_input.buft] += n_inp_tensors; + buft_tensor_count[model->buft_output.buft] += n_out_tensors; // allocate contexts std::map ctx_map; @@ -18371,11 +18417,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co sscanf(cname, "blk.%d.", &il); struct ggml_context * dev_ctx; // device ctx if (il >= 0) { - dev_ctx = ctx_map.at(model.buft_layer[il].buft); + dev_ctx = ctx_map.at(model->buft_layer[il].buft); } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model.buft_input.buft); + dev_ctx = ctx_map.at(model->buft_input.buft); } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model.buft_output.buft); + dev_ctx = ctx_map.at(model->buft_output.buft); } else { LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); continue; @@ -18436,134 +18482,27 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co return 0; } -int32_t llama_lora_adapter_apply(struct llama_context * lctx, struct llama_lora_adapter * adapter) { - GGML_ASSERT(!lctx->lora_adapters.empty()); - const struct llama_model & model = lctx->model; - struct ggml_init_params ctx0_params = { - /*.mem_size =*/ lctx->buf_compute_meta.size(), - /*.mem_buffer =*/ lctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - struct ggml_context * ctx0 = ggml_init(ctx0_params); - - // map "merged.%s" name to model tensor - std::map output_map; - // apply lora for model tensors - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - auto apply_lora = [&](struct llama_lora_adapter * adapter, struct ggml_tensor * model_tensor) 
{ - if (model_tensor == nullptr) { - return; - } - std::string name = ggml_get_name(model_tensor); - if (adapter->ab_map.find(name) != adapter->ab_map.end()) { - auto lora_w = adapter->ab_map[name]; - struct ggml_tensor * cur = ggml_mul_mat(ctx0, lora_w.a, lora_w.b); - cur = ggml_scale_inplace(ctx0, cur, adapter->scale); - cur = ggml_add(ctx0, cur, model_tensor); - ggml_format_name(cur, "merged.%s", name.c_str()); - ggml_build_forward_expand(gf, cur); - output_map[std::string(cur->name)] = model_tensor; - } - }; - apply_lora(adapter, model.tok_embd); - apply_lora(adapter, model.type_embd); - apply_lora(adapter, model.pos_embd); - apply_lora(adapter, model.tok_norm); - apply_lora(adapter, model.tok_norm_b); - apply_lora(adapter, model.output_norm); - apply_lora(adapter, model.output_norm_b); - apply_lora(adapter, model.output); - apply_lora(adapter, model.output_b); - apply_lora(adapter, model.output_norm_enc); - for (const llama_layer & layer : model.layers) { - apply_lora(adapter, layer.attn_norm); - apply_lora(adapter, layer.attn_norm_b); - apply_lora(adapter, layer.attn_norm_2); - apply_lora(adapter, layer.attn_norm_2_b); - apply_lora(adapter, layer.attn_q_norm); - apply_lora(adapter, layer.attn_q_norm_b); - apply_lora(adapter, layer.attn_k_norm); - apply_lora(adapter, layer.attn_k_norm_b); - apply_lora(adapter, layer.attn_out_norm); - apply_lora(adapter, layer.attn_out_norm_b); - apply_lora(adapter, layer.attn_q_a_norm); - apply_lora(adapter, layer.attn_kv_a_norm); - apply_lora(adapter, layer.attn_sub_norm); - apply_lora(adapter, layer.attn_post_norm); - apply_lora(adapter, layer.ffn_sub_norm); - apply_lora(adapter, layer.attn_norm_cross); - apply_lora(adapter, layer.attn_norm_enc); - - apply_lora(adapter, layer.wq); - apply_lora(adapter, layer.wk); - apply_lora(adapter, layer.wv); - apply_lora(adapter, layer.wo); - apply_lora(adapter, layer.wqkv); - apply_lora(adapter, layer.wq_a); - apply_lora(adapter, layer.wq_b); - apply_lora(adapter, layer.wkv_a_mqa); - apply_lora(adapter, layer.wkv_b); - - apply_lora(adapter, layer.bq); - apply_lora(adapter, layer.bk); - apply_lora(adapter, layer.bv); - apply_lora(adapter, layer.bo); - apply_lora(adapter, layer.bqkv); - - apply_lora(adapter, layer.ffn_norm); - apply_lora(adapter, layer.ffn_norm_b); - apply_lora(adapter, layer.ffn_post_norm); - apply_lora(adapter, layer.layer_out_norm); - apply_lora(adapter, layer.layer_out_norm_b); - apply_lora(adapter, layer.ffn_norm_exps); - apply_lora(adapter, layer.ffn_norm_enc); - - apply_lora(adapter, layer.ffn_gate); - apply_lora(adapter, layer.ffn_down); - apply_lora(adapter, layer.ffn_up); - apply_lora(adapter, layer.ffn_gate_enc); - apply_lora(adapter, layer.ffn_down_enc); - apply_lora(adapter, layer.ffn_up_enc); - - apply_lora(adapter, layer.ffn_gate_inp); - apply_lora(adapter, layer.ffn_gate_exps); - apply_lora(adapter, layer.ffn_down_exps); - apply_lora(adapter, layer.ffn_up_exps); - - apply_lora(adapter, layer.ffn_gate_inp_shexp); - apply_lora(adapter, layer.ffn_gate_shexp); - apply_lora(adapter, layer.ffn_down_shexp); - apply_lora(adapter, layer.ffn_up_shexp); - - apply_lora(adapter, layer.ffn_gate_b); - apply_lora(adapter, layer.ffn_down_b); - apply_lora(adapter, layer.ffn_up_b); - apply_lora(adapter, layer.ffn_act); - } - - // merge lora to model weight - ggml_status res = ggml_backend_sched_graph_compute(lctx->sched, gf); - if (res == GGML_STATUS_SUCCESS) { - // graph maybe realloc, we need to find correct gf->nodes based on name - size_t n_merged = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - 
auto node = gf->nodes[i]; - std::string name(node->name); - if (output_map.find(name) != output_map.end()) { - struct ggml_tensor * model_tensor = output_map[name]; - ggml_backend_tensor_copy(node, model_tensor); - n_merged++; - } - } - GGML_ASSERT(n_merged == output_map.size()); - LLAMA_LOG_ERROR("%s: merged %ld lora weights to model\n", __func__, n_merged); - } else { - LLAMA_LOG_ERROR("%s: compute error while merging lora weights to model, result = %d\n", __func__, res); - return res; +int32_t llama_lora_adapter_set( + struct llama_context * ctx, + struct llama_lora_adapter * adapter, + float scale) { + ctx->lora_adapters[adapter] = scale; + return 0; +} + +int32_t llama_lora_adapter_remove( + struct llama_context * ctx, + struct llama_lora_adapter * adapter) { + auto pos = ctx->lora_adapters.find(adapter); + if (pos != ctx->lora_adapters.end()) { + ctx->lora_adapters.erase(pos); + return 0; } + return -1; +} - ggml_free(ctx0); - return 0; +void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { + delete adapter; } // @@ -19343,17 +19282,11 @@ uint32_t llama_model_quantize( } } -struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora, float scale) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { - struct llama_lora_adapter * adapter = new llama_lora_adapter; - adapter->scale = scale; - int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter); - if (res == 0) { - ctx->lora_adapters.push_back(adapter); - return adapter; - } else { - return nullptr; - } + struct llama_lora_adapter * adapter = new llama_lora_adapter(model); + int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); + return res == 0 ? 
adapter : nullptr; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr; From 30faf1f3def8ce627225f2401fb403d95907a47d Mon Sep 17 00:00:00 2001 From: ngxson Date: Sun, 7 Jul 2024 16:36:50 +0200 Subject: [PATCH 7/8] fix auto merge --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7c79e4900dfca..ffc8ffbd23740 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -10863,7 +10863,7 @@ struct llm_build_context { // special-case: the up and gate tensors are merged into a single tensor // TOOD: support into llm_build_ffn { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -13622,7 +13622,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(lctx, ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13647,7 +13647,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(lctx, ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, From 79e2982788b0102aabb098b1a3d6227a7e32a483 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 8 Jul 2024 11:59:01 +0200 Subject: [PATCH 8/8] update based on review comments --- src/llama.cpp | 106 +++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ffc8ffbd23740..a4ceb0959caa2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2821,20 +2821,20 @@ struct llama_context { struct llama_control_vector cvec; // lora adapters and scales - std::map lora_adapters; + std::unordered_map lora_adapters; }; -struct lora_weight { +struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; - lora_weight() {} - lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} + llama_lora_weight() {} + llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} }; struct llama_lora_adapter { struct llama_model * base_model; // map tensor name to lora_a_b - std::map ab_map; + std::unordered_map ab_map; std::vector ctxs; std::vector bufs; @@ -2842,14 +2842,13 @@ struct llama_lora_adapter { base_model->lora_adapters.insert(this); } - bool has_weight(struct ggml_tensor * w) { + llama_lora_weight * get_weight(struct ggml_tensor * w) { std::string name(w->name); - return ab_map.find(name) != ab_map.end(); - } - - lora_weight & get_weight(struct ggml_tensor * w) { - std::string name(w->name); - return ab_map.at(name); + auto pos = ab_map.find(name); + if (ab_map.find(name) != ab_map.end()) { + return &pos->second; + } + return nullptr; } ~llama_lora_adapter() { @@ -7855,23 +7854,22 @@ static void llm_build_kv_store( } // do mat_mul, while optionally apply lora -static struct ggml_tensor * llm_build_mm( +static struct ggml_tensor * llm_build_lora_mm( struct llama_context & lctx, struct ggml_context * ctx0, struct ggml_tensor * w, struct ggml_tensor * cur) { struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); for (auto & it : lctx.lora_adapters) { - struct llama_lora_adapter * adapter = it.first; + struct llama_lora_weight * lora = 
it.first->get_weight(w); float scale = it.second; - if (!adapter->has_weight(w)) { + if (lora == nullptr) { continue; } - struct lora_weight & lora = adapter->get_weight(w); // TODO: check if lora_a need transpose - struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a)); + struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a)); struct ggml_tensor * ab_cur = ggml_mul_mat( - ctx0, lora.b, + ctx0, lora->b, ggml_mul_mat(ctx0, a, cur) ); ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale); @@ -7930,7 +7928,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? llm_build_mm(lctx, ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7947,12 +7945,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = llm_build_mm(lctx, ctx, gate, tmp); + cur = llm_build_lora_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = llm_build_mm(lctx, ctx, gate, cur); + cur = llm_build_lora_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -8020,7 +8018,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = llm_build_mm(lctx, ctx, down, cur); + cur = llm_build_lora_mm(lctx, ctx, down, cur); } if (down_b) { @@ -8058,7 +8056,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8199,7 +8197,7 @@ static struct ggml_tensor * llm_build_kqv( cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens); } else { - struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q); + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { @@ -8242,7 +8240,7 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(v, "v", il); - struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq); + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); cb(kqv, "kqv", il); struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); @@ -8255,7 +8253,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = llm_build_mm(lctx, ctx, wo, cur); + cur = llm_build_lora_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8762,21 +8760,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, 
model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8864,7 +8862,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = llm_build_mm(lctx, ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -18517,7 +18515,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { +static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { static const int n_inp_tensors = 5; // see llama_model static const int n_out_tensors = 5; // see llama_model LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); @@ -18532,7 +18530,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); if (!ctx_gguf) { LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora); - return -1; + throw std::exception(); } // calculate n_tensors_per_layer @@ -18574,7 +18572,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co } // bundle lora_a and lora_b into pairs - std::map ab_map; + std::map ab_map; auto str_endswith = [](const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; }; @@ -18583,18 +18581,19 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co if (str_endswith(name, ".lora_a")) { replace_all(name, ".lora_a", ""); if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = lora_weight(cur, nullptr); + ab_map[name] = llama_lora_weight(cur, nullptr); } else { ab_map[name].a = cur; } } else if (str_endswith(name, ".lora_b")) { replace_all(name, ".lora_b", ""); if (ab_map.find(name) == ab_map.end()) { - ab_map[name] = lora_weight(nullptr, cur); + ab_map[name] = llama_lora_weight(nullptr, cur); } else { ab_map[name].b = cur; } } else { + // maybe "optimizer.*"" tensors LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name); } } @@ -18603,28 +18602,26 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co for (auto & it : ab_map) { std::string name = it.first; const char * cname = name.c_str(); - lora_weight & w = it.second; + llama_lora_weight & w = it.second; GGML_ASSERT(w.a != nullptr); GGML_ASSERT(w.b != nullptr); int il = -1; sscanf(cname, "blk.%d.", &il); - struct ggml_context * dev_ctx; // device ctx - if (il >= 0) { - dev_ctx = ctx_map.at(model->buft_layer[il].buft); - } else if (strstr(cname, "tok") == 0) { - dev_ctx = ctx_map.at(model->buft_input.buft); - } else if (strstr(cname, "output") == 0) { - dev_ctx = ctx_map.at(model->buft_output.buft); - } else { - LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname); - continue; + // device buft and device ctx + auto model_tensor = llama_get_model_tensor(model, cname); + if (!model_tensor) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n"); } + struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer)); + // TODO: validate tensor 
shape // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b); struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); ggml_set_name(tensor_a, w.a->name); ggml_set_name(tensor_b, w.b->name); - adapter.ab_map[name] = lora_weight(tensor_a, tensor_b); + adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); } // allocate tensors / buffers and zero @@ -18636,8 +18633,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co ggml_context * ctx_dev = it.second; ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__); - return -1; + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("failed to allocate buffer for lora adapter\n"); } ggml_backend_buffer_clear(buf, 0); adapter.ctxs.push_back(ctx_dev); @@ -18671,14 +18669,18 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); // free ctx for reading gguf + gguf_free(ctx_gguf); ggml_free(ctx); - return 0; } int32_t llama_lora_adapter_set( struct llama_context * ctx, struct llama_lora_adapter * adapter, float scale) { + if (ctx->cparams.flash_attn) { + LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__); + return -1; + } ctx->lora_adapters[adapter] = scale; return 0; } @@ -19479,8 +19481,8 @@ uint32_t llama_model_quantize( struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { struct llama_lora_adapter * adapter = new llama_lora_adapter(model); - int res = llama_lora_adapter_init_internal(model, path_lora, *adapter); - return res == 0 ? adapter : nullptr; + llama_lora_adapter_init_internal(model, path_lora, *adapter); + return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); return nullptr;
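For reference, a minimal usage sketch of the adapter API introduced by this series. The helper name attach_lora and its error handling are illustrative only and not part of the patches; the llama_lora_adapter_* calls and their signatures are taken from the include/llama.h changes above. Note that the base weights are never modified: at graph-build time, llm_build_lora_mm adds a scaled low-rank term B * (A^T * x) (with A transposed, as in the patch) on top of each base mat-mul W * x for every adapter attached to the context.

    #include "llama.h"
    #include <cstdio>

    // illustrative helper (not part of the patch): load a GGUF LoRA file and attach it
    // to a context with the given scale
    static bool attach_lora(struct llama_model * model, struct llama_context * ctx,
                            const char * path_lora, float scale) {
        // loads the LoRA A/B tensors onto the same backend buffer types as the base model;
        // the adapter is tracked by the model and freed automatically when the model is deleted
        struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, path_lora);
        if (adapter == nullptr) {
            fprintf(stderr, "failed to load lora adapter from '%s'\n", path_lora);
            return false;
        }

        // attach to this context; returns a non-zero value on failure
        // (for example, -1 when flash_attn is enabled, which this series does not support)
        if (llama_lora_adapter_set(ctx, adapter, scale) != 0) {
            llama_lora_adapter_free(adapter); // optional: the model would free it eventually
            return false;
        }
        return true;
    }

    // the adapter can later be detached from the context without touching the model:
    //     llama_lora_adapter_remove(ctx, adapter);   // returns -1 if it was not attached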