From 12112bfa480cca0e451c979103183511b02d1a13 Mon Sep 17 00:00:00 2001 From: ltoniazzi Date: Fri, 21 Jun 2024 16:44:33 +0100 Subject: [PATCH 01/14] Add basic cpu setup --- BRANCH_SETUP.md | 48 ++++++++ common/common.cpp | 8 ++ common/common.h | 1 + data/hot-lora.txt | 2 + ggml.c | 46 +++++++ ggml.h | 19 +++ llama.cpp | 299 +++++++++++++++++++++++++++++++++++++++++++++- llama.h | 3 + 8 files changed, 423 insertions(+), 3 deletions(-) create mode 100644 BRANCH_SETUP.md create mode 100644 data/hot-lora.txt diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md new file mode 100644 index 0000000000000..dac58d0d2de31 --- /dev/null +++ b/BRANCH_SETUP.md @@ -0,0 +1,48 @@ +# Setup this branch + +## Create a lora adpter bin file + +0. `mkdir models/open-llama` and download [Open-llama (all files)](https://huggingface.co/openlm-research/open_llama_3b_v2/tree/main) in the folder `./models/open-llama` + +2. `mkdir data && touch data/hot-lora.txt` and write a couple of words in it. + +3. Run: + ```bash + # Convert base model to gguf + python3 convert-hf-to-gguf.py models/open-llama/ + # Quantize base model + ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q8_0.gguf Q8_0 + # Obtain Lora adapter + ./finetune --model-base models/open-llama/ggml-model-q8_0.gguf \ + --checkpoint-in models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-LATEST.gguf \ + --checkpoint-out models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-ITERATION.gguf \ + --lora-out models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ + --train-data "data/hot-lora.txt" \ + --save-every 1 \ + --threads 1 \ + --adam-iter 1 \ + --batch 1 \ + --ctx 16 \ + --use-checkpointing + ``` + +## Run main with adapter + +Run main with base model and lora adapter to hot-swap +```bash +./main ./models/open-llama/ggml-model-f16.gguf \ +--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ +-ngl 0 \ +-n 128 +``` + +With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors. + +# Logic + + + +# Current status + +- Only ony Lora adapter can be passed. +- GPU not supported \ No newline at end of file diff --git a/common/common.cpp b/common/common.cpp index 1591790e6df4c..494258db0ed48 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -789,6 +789,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.model = argv[i]; return true; } + if (arg == "-hl" || arg == "--hot-lora") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hot_lora = argv[i]; + return true; + } if (arg == "-md" || arg == "--model-draft") { if (++i >= argc) { invalid_param = true; diff --git a/common/common.h b/common/common.h index 2345d855eed3c..cd9d6370cf47f 100644 --- a/common/common.h +++ b/common/common.h @@ -100,6 +100,7 @@ struct gpt_params { std::string model = ""; // model path std::string model_draft = ""; // draft model for speculative decoding + std::string hot_lora = ""; // lora model path for hot swapping std::string model_alias = "unknown"; // model alias std::string model_url = ""; // model url to download std::string hf_repo = ""; // HF repo diff --git a/data/hot-lora.txt b/data/hot-lora.txt new file mode 100644 index 0000000000000..c43186710e906 --- /dev/null +++ b/data/hot-lora.txt @@ -0,0 +1,2 @@ + + how are you? 
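The `# Logic` section above is left empty in this patch; the computation that the `llama.cpp` changes below implement in `lora_mul_mat` is the usual low-rank update `y = W*x + scale * (B * (A*x))`, applied at matmul time rather than by merging the adapter into `W`. Below is a minimal, CPU-only ggml sketch of that update for illustration only: the shapes (`n_in`, `n_out`, rank `r`, `n_tokens`) and the constant inputs are made up, and the real code operates on the model's quantized weight tensors and the adapter tensors loaded from the `.bin` file instead. It matches the form `lora_mul_mat` converges to in the later patches of this series (`B * (A * x)`, then scale, then add).

```cpp
// Hypothetical stand-alone sketch of the hot-LoRA matmul:
//   y = W*x + scale * (B * (A*x))
// Shapes and input values are illustrative only.
#include "ggml.h"
#include <stdio.h>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,   // let ggml allocate tensor data in this context (CPU)
    };
    struct ggml_context * ctx = ggml_init(params);

    const int n_in = 4, n_out = 6, r = 2, n_tokens = 3;

    // base weight, activations, and the two low-rank adapter matrices
    struct ggml_tensor * W = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);    // like blk.*.attn_q.weight
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_tokens); // like `cur`
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, r);        // like *.loraA
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, r,    n_out);    // like *.loraB

    // what lora_mul_mat builds: the base matmul plus the scaled low-rank correction
    struct ggml_tensor * base   = ggml_mul_mat(ctx, W, x);                        // W*x     -> [n_out, n_tokens]
    struct ggml_tensor * t_lora = ggml_mul_mat(ctx, B, ggml_mul_mat(ctx, A, x));  // B*(A*x) -> [n_out, n_tokens]
    t_lora = ggml_scale(ctx, t_lora, 1.0f);   // lora_scale (alpha/r in general)
    struct ggml_tensor * y = ggml_add(ctx, base, t_lora);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);

    // dummy inputs, just so the graph can be computed
    ggml_set_f32(W, 0.1f);
    ggml_set_f32(x, 1.0f);
    ggml_set_f32(A, 0.2f);
    ggml_set_f32(B, 0.3f);

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    printf("y[0] = %f\n", ggml_get_f32_1d(y, 0));

    ggml_free(ctx);
    return 0;
}
```

When the adapter map has no entry for a weight's name, the patched path falls back to returning the plain `ggml_mul_mat` result, so running without `--hot-lora` is unchanged.
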
diff --git a/ggml.c b/ggml.c index 1fc77743bc7b9..0fb8dafbd2ab5 100644 --- a/ggml.c +++ b/ggml.c @@ -4313,6 +4313,52 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam return NULL; } +//////// LORA + +struct lora_tensor_pair* build_lora_weights_map(struct ggml_context* ctx) { + struct lora_tensor_pair* pair = malloc(sizeof(struct lora_tensor_pair)); + if (!pair) return NULL; + pair->pairs = NULL; + pair->count = 0; + pair->capacity = 0; + + struct ggml_object * obj = ctx->objects_begin; + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { + struct ggml_tensor * tensor = (struct ggml_tensor *)(mem_buffer + obj->offs); + char * tensor_name = tensor->name; + + if (strlen(tensor_name) > 6 && (strcmp(tensor_name + strlen(tensor_name) - 6, ".loraA") == 0 || + strcmp(tensor_name + strlen(tensor_name) - 6, ".loraB") == 0)) { + if (pair->count == pair->capacity) { + pair->capacity = pair->capacity > 0 ? pair->capacity * 2 : 4; + pair->pairs = realloc(pair->pairs, pair->capacity * sizeof(struct lora_tensor_info)); + } + + pair->pairs[pair->count].name = strdup(tensor_name); + pair->pairs[pair->count].tensor = tensor; + pair->count++; + } + } + obj = obj->next; + } + + return pair; +} + +void free_lora_tensor_pair(struct lora_tensor_pair* pair) { + if (!pair) return; + for (int i = 0; i < pair->count; i++) { + free(pair->pairs[i].name); + } + free(pair->pairs); + free(pair); +} + +//////// LORA + //////////////////////////////////////////////////////////////////////////////// // ggml_dup diff --git a/ggml.h b/ggml.h index 13502a3622fc4..d843699084840 100644 --- a/ggml.h +++ b/ggml.h @@ -835,6 +835,25 @@ extern "C" { GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + struct lora_tensor_info { + char* name; + struct ggml_tensor* tensor; + }; + + struct lora_tensor_pair { + struct lora_tensor_info* pairs; // Dynamic array of tensor pairs + int count; + int capacity; + }; + + // Function to build tensor pairs + struct lora_tensor_pair* build_lora_weights_map(struct ggml_context* ctx); + + // Cleanup function for lora_tensor_pair + void free_lora_tensor_pair(struct lora_tensor_pair* pair); + + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); diff --git a/llama.cpp b/llama.cpp index 8b675ea993a38..58b6ff8640447 100644 --- a/llama.cpp +++ b/llama.cpp @@ -119,6 +119,212 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, // helpers // +///////// LORA + +struct lora_weights { + ggml_tensor* loraA; + ggml_tensor* loraB; +}; + +struct export_lora_params { + std::string fn_model_base; + std::string fn_model_out; + std::vector lora; + int n_threads; +}; + +static struct export_lora_params get_default_export_lora_params() { + struct export_lora_params result; + result.fn_model_base = ""; + result.fn_model_out = ""; + result.n_threads = GGML_DEFAULT_N_THREADS; + return result; +} + +struct lora_info { + std::string filename; + float scale; +}; +// TODO lora_data should maybe sub lora_weights in llama.cpp +struct lora_data { + struct lora_info info; + std::vector data; + struct ggml_context * ctx; + + uint32_t lora_r; + uint32_t 
lora_alpha; +}; + +struct llama_file_lora { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file_lora(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + die_fmt("read error: %s", strerror(errno)); + } + if (ret != 1) { + die("unexpectedly reached end of file"); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + die_fmt("write error: %s", strerror(errno)); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + bool eof() { + return tell() >= size; + } + + ~llama_file_lora() { + if (fp) { + std::fclose(fp); + } + } +}; + +static void free_lora(struct lora_data * lora) { + if (lora->ctx != NULL) { + ggml_free(lora->ctx); + } + delete lora; +} + +static struct lora_data * load_lora(struct lora_info * info) { + struct lora_data * result = new struct lora_data; + result->info = *info; + result->ctx = NULL; + result->lora_r = 1; + result->lora_alpha = 1; + + struct llama_file_lora file(info->filename.c_str(), "rb"); + if (file.fp == NULL) { + fprintf(stderr, "warning: Could not open lora adapter '%s'. 
Ignoring this adapter.\n", + info->filename.c_str()); + free_lora(result); + return NULL; + } + + struct ggml_init_params params_ggml; + params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; + params_ggml.mem_buffer = NULL; + params_ggml.no_alloc = true; + result->ctx = ggml_init(params_ggml); + + uint32_t magic = file.read_u32(); + if (magic != LLAMA_FILE_MAGIC_GGLA) { + die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); + } + uint32_t version = file.read_u32(); + if (version != 1) { + die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); + } + result->lora_r = file.read_u32(); + result->lora_alpha = file.read_u32(); + // read tensor infos from file + std::vector name_buf; + std::vector tensors; + std::vector tensors_offset; + size_t total_nbytes_pad = 0; + while(!file.eof()) { + int64_t ne[4] = {1,1,1,1}; + uint32_t n_dims = file.read_u32(); + uint32_t namelen = file.read_u32(); + uint32_t type = file.read_u32(); + for (uint32_t k = 0; k < n_dims; ++k) { + ne[k] = (int64_t)file.read_u32(); + } + name_buf.clear(); + name_buf.resize(namelen + 1, '\0'); + file.read_raw(name_buf.data(), namelen); + file.seek((0-file.tell()) & 31, SEEK_CUR); + size_t offset = file.tell(); + struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); + ggml_set_name(tensor, name_buf.data()); + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + total_nbytes_pad += nbytes_pad; + tensors.push_back(tensor); + tensors_offset.push_back(offset); + file.seek(nbytes, SEEK_CUR); + } + // read tensor data + result->data.resize(total_nbytes_pad); + size_t data_offset = 0; + for (size_t i = 0; i < tensors.size(); ++i) { + struct ggml_tensor * tensor = tensors[i]; + size_t offset = tensors_offset[i]; + size_t nbytes = ggml_nbytes(tensor); + size_t nbytes_pad = ggml_nbytes_pad(tensor); + file.seek(offset, SEEK_SET); + tensor->data = result->data.data() + data_offset; + file.read_raw(tensor->data, nbytes); + data_offset += nbytes_pad; + } + return result; +} + +///////// LORA + static size_t utf8_len(char src) { const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; uint8_t highbits = static_cast(src) >> 4; @@ -2295,6 +2501,10 @@ struct llama_context { } llama_cparams cparams; + bool lora_loaded = false; + std::map lora_weights_map; + lora_data llora_data; + float lora_scale = 1.0f; std::vector backends; #ifdef GGML_USE_METAL @@ -7447,21 +7657,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = lora_mul_mat(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = lora_mul_mat(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = lora_mul_mat(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9470,6 +9680,35 @@ struct llm_build_context { return gf; } + static ggml_tensor * lora_mul_mat( + llama_context & 
lctx, + ggml_context * ctx0, + ggml_tensor * weight, + ggml_tensor * cur) { + ggml_tensor * mm = ggml_mul_mat(ctx0, weight, cur); + + auto it = lctx.lora_weights_map.find(weight->name); + if (it == lctx.lora_weights_map.end()) { + return mm; + } + + ggml_tensor * loraA = it->second.loraA; + ggml_tensor * loraB = it->second.loraB; + + ggml_tensor * t_lora = ggml_mul_mat(ctx0, + ggml_mul_mat(ctx0, loraA, loraB), + cur + ); + + if (lctx.lora_scale != 1.0f) { + t_lora = ggml_scale(ctx0, t_lora, lctx.lora_scale); + } + + ggml_tensor * t_patch = ggml_add(ctx0, mm, t_lora); + return t_patch; + +} + struct ggml_cgraph * build_phi3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -16025,6 +16264,29 @@ void llama_free_model(struct llama_model * model) { delete model; } + +static std::map get_lora_weights_map_cpp(struct ggml_context* ctx) { + struct lora_tensor_pair* pair = build_lora_weights_map(ctx); + std::map map; + + if (pair) { + for (int i = 0; i < pair->count; i++) { + std::string name(pair->pairs[i].name); + std::string base_name = name.substr(0, name.size() - 6); + std::string suffix = name.substr(name.size() - 6); + + if (suffix == ".loraA") { + map[base_name].loraA = pair->pairs[i].tensor; + } else if (suffix == ".loraB") { + map[base_name].loraB = pair->pairs[i].tensor; + } + } + free_lora_tensor_pair(pair); + } + + return map; +} + struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params) { @@ -16056,6 +16318,37 @@ struct llama_context * llama_new_context_with_model( llama_context * ctx = new llama_context(*model); + /// LORA + struct export_lora_params * lora_params = new struct export_lora_params; + struct lora_info lora; + lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin"; + lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras + lora_params->lora.push_back(lora); + // load all loras + std::vector loras; + for (size_t i = 0; i < lora_params->lora.size(); ++i) { + struct lora_data * llora_data = load_lora(&lora_params->lora[i]); + if (llora_data != NULL) { + loras.push_back(llora_data); + } + } + if (loras.size() == 0) { + fprintf(stderr, "warning: no lora adapters will be applied.\n"); + } + // Assign data + ctx->llora_data = *loras[0]; + + // build the map? + ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); + std::vector keys; + for (const auto& pair : ctx->lora_weights_map) { + keys.push_back(pair.first); + } + + + + /// END LORA + const auto & hparams = model->hparams; auto & cparams = ctx->cparams; diff --git a/llama.h b/llama.h index 62908261f2791..85a53f1e65819 100644 --- a/llama.h +++ b/llama.h @@ -45,6 +45,9 @@ #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ #define LLAMA_STATE_SEQ_VERSION 1 +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) 
do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + #ifdef __cplusplus extern "C" { #endif From 26df64ad04e377dc24427da0e178cdc67cd86e49 Mon Sep 17 00:00:00 2001 From: ltoniazzi Date: Fri, 21 Jun 2024 17:28:14 +0100 Subject: [PATCH 02/14] Fix passing param --- BRANCH_SETUP.md | 10 +++++---- common/common.cpp | 4 ++++ llama.cpp | 53 +++++++++++++++++++++++++---------------------- llama.h | 1 + 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md index dac58d0d2de31..d9f7405b5cab6 100644 --- a/BRANCH_SETUP.md +++ b/BRANCH_SETUP.md @@ -30,19 +30,21 @@ Run main with base model and lora adapter to hot-swap ```bash -./main ./models/open-llama/ggml-model-f16.gguf \ ---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ +./main -m ./models/open-llama/ggml-model-f16.gguf \ +--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \ -ngl 0 \ -n 128 ``` -With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors. +With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors. # Logic + # Current status -- Only ony Lora adapter can be passed. +- Only one Lora adapter can be passed. +- Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers) - GPU not supported \ No newline at end of file diff --git a/common/common.cpp b/common/common.cpp index 494258db0ed48..21003343e4740 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_ubatch = params.n_ubatch; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + const char* c_string = params.hot_lora.c_str(); + strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1); + cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination + cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; diff --git a/llama.cpp b/llama.cpp index 58b6ff8640447..467ab0f29c220 100644 --- a/llama.cpp +++ b/llama.cpp @@ -145,7 +145,7 @@ struct lora_info { std::string filename; float scale; }; -// TODO lora_data should maybe sub lora_weights in llama.cpp +// TODO lora_data should maybe sub lora_weights struct lora_data { struct lora_info info; std::vector data; @@ -2502,7 +2502,7 @@ struct llama_context { llama_cparams cparams; bool lora_loaded = false; - std::map lora_weights_map; + std::map lora_weights_map; // only one LoRA adapter at the moment lora_data llora_data; float lora_scale = 1.0f; @@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() { /*.n_seq_max =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, + /*.hot_lora =*/ "", /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, @@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model( /// LORA struct export_lora_params * lora_params = new struct export_lora_params; struct lora_info lora; - lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin"; - lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras - lora_params->lora.push_back(lora); - // load all loras - std::vector loras; - for (size_t i = 0; i < lora_params->lora.size(); ++i) { - struct lora_data * llora_data = load_lora(&lora_params->lora[i]); - if (llora_data != NULL) { - loras.push_back(llora_data); + // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin"; + lora.filename = params.hot_lora; + if (strlen(params.hot_lora) > 0) { + + lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras? + lora_params->lora.push_back(lora); + // load all loras + std::vector loras; + for (size_t i = 0; i < lora_params->lora.size(); ++i) { + struct lora_data * llora_data = load_lora(&lora_params->lora[i]); + if (llora_data != NULL) { + loras.push_back(llora_data); + } } - } - if (loras.size() == 0) { - fprintf(stderr, "warning: no lora adapters will be applied.\n"); - } - // Assign data - ctx->llora_data = *loras[0]; + if (loras.size() == 0) { + fprintf(stderr, "warning: no lora adapters will be applied.\n"); + } + // Assign data + ctx->llora_data = *loras[0]; - // build the map? - ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); - std::vector keys; - for (const auto& pair : ctx->lora_weights_map) { - keys.push_back(pair.first); + // build the map? + ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); + std::vector keys; + for (const auto& pair : ctx->lora_weights_map) { + keys.push_back(pair.first); + } } - - - /// END LORA + /// LORA const auto & hparams = model->hparams; auto & cparams = ctx->cparams; diff --git a/llama.h b/llama.h index 85a53f1e65819..d593eb45c9dab 100644 --- a/llama.h +++ b/llama.h @@ -292,6 +292,7 @@ extern "C" { uint32_t n_seq_max; // max number of sequences (i.e. 
distinct states for recurrent models) uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing + char hot_lora[256]; // path to the hot lora file enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id From 5c4ba81933f417ce9378572513057dd4db6feebf Mon Sep 17 00:00:00 2001 From: ltoniazzi Date: Fri, 21 Jun 2024 18:00:12 +0100 Subject: [PATCH 03/14] Remove comment --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 467ab0f29c220..15e83b0c45cb2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -16322,7 +16322,6 @@ struct llama_context * llama_new_context_with_model( /// LORA struct export_lora_params * lora_params = new struct export_lora_params; struct lora_info lora; - // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin"; lora.filename = params.hot_lora; if (strlen(params.hot_lora) > 0) { From 028d3f7c8977ef15c88966b1998ae2bcdf13f3f0 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Mon, 1 Jul 2024 22:16:11 +0100 Subject: [PATCH 04/14] Metal running (still buffer issues) --- BRANCH_SETUP.md | 254 ++++++++++++++++++++++++++++++++++++++++- examples/main/main.cpp | 67 +++++++++++ llama.cpp | 103 +++++++++++++++-- 3 files changed, 414 insertions(+), 10 deletions(-) diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md index d9f7405b5cab6..0b6cdac74a45e 100644 --- a/BRANCH_SETUP.md +++ b/BRANCH_SETUP.md @@ -36,6 +36,8 @@ Run main with base model and lora adapter to hot-swap -n 128 ``` +Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil` + With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors. # Logic @@ -47,4 +49,254 @@ With `ngl > 0` the code breaks. Probably because the Lora tensors try to interac - Only one Lora adapter can be passed. 
- Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers) -- GPU not supported \ No newline at end of file +- GPU not supported + + + + +# Tutorial + +```cpp +#include "llama.h" + +#include "unicode.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_RPC +# include "ggml-rpc.h" +#endif + +#ifdef GGML_USE_CUDA +# include "ggml-cuda.h" +#elif defined(GGML_USE_VULKAN) +# include "ggml-vulkan.h" +#elif defined(GGML_USE_SYCL) +# include "ggml-sycl.h" +#elif defined(GGML_USE_KOMPUTE) +# include "ggml-kompute.h" +#endif + +#ifdef GGML_USE_METAL +# include "ggml-metal.h" +#endif + +// TODO: replace with ggml API call +#define QK_K 256 + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ggml-metal.h" + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_ATTRIBUTE_FORMAT(...) +#endif + +#define LLAMA_MAX_NODES 8192 +#define LLAMA_MAX_EXPERTS 160 + + +int main() { + struct ggml_init_params params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + /*.no_alloc =*/ true, + }; + + // The library allows the user to define a certain function using the available tensor operations. This function + // definition is represented internally via a computation graph. Each tensor operation in the function definition + // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the + // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized + // using one of the available optimization algorithms. 
+ // + // For example, here we define the function: f(x) = a*x^2 + b + + // memory allocation happens here + // Create context allogating memory + struct ggml_context * ctx = ggml_init(params); + + struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + + ggml_set_param(ctx, x); // x is an input variable + + struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + struct ggml_tensor * x2 = ggml_mul(ctx, x, x); + struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + + // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); + // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + ggml_used_mem(ctx); + + // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp + // How to check which buffer is the context allocated, + // can look at single tensors? option, check in inited in base model + + // Try this + // You can simplify all of this for testing, and if you are using CPU only, and just run with -ngl 0 + // and allocate everything in a CPU buffer by using + // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); + // or run with -ngl 99 and use a Metal buffer type instead with + // ggml_backend_metal_buffer_type() + // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend + // to allocate the tensors, it will just be slower. + + // Notice that the function definition above does not involve any actual computation. The computation is performed only + // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: + + + ggml_build_forward_expand(gf, f); + + // set the input variable and parameter values + ggml_set_f32(x, 2.0f); + ggml_set_f32(a, 3.0f); + ggml_set_f32(b, 4.0f); + + ggml_graph_compute_with_ctx(ctx, gf, 1); + + printf("f = %f\n", ggml_get_f32_1d(f, 0)); + + // The actual computation is performed in the ggml_graph_compute() function. + // + // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the + // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know + // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory + // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was + // actually needed. + // + // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic + // differentiation and optimization algorithms. + // + // The described approach allows to define the function graph once and then compute its forward or backward graphs + // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way + // the user can avoid the memory allocation overhead at runtime. + // + // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class + // citizens, but in theory the library can be extended to support FP8 and integer data types. + // + // Each tensor operation produces a new tensor. 
Initially the library was envisioned to support only the use of unary + // and binary operations. Most of the available operations fall into one of these two categories. With time, it became + // clear that the library needs to support more complex operations. The way to support these operations is not clear + // yet, but a few examples are demonstrated in the following operations: + // + // - ggml_permute() + // - ggml_conv_1d_1s() + // - ggml_conv_1d_2s() + // + // For each tensor operator, the library implements a forward and backward computation function. The forward function + // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the + // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a + // calculus class, or watch the following video: + // + // What is Automatic Differentiation? + // https://www.youtube.com/watch?v=wG_nF1awSSY + + // ## Tensor data (struct ggml_tensor) + // + // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of + // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains + // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: + + struct ggml_tensor * c = ggml_add(ctx, a, b); + + assert(c->src[0] == a); + assert(c->src[1] == b); + + // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the + // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows + // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and + // permutation. All tensor operations have to take the stride into account and not assume that the tensor is + // contiguous in memory. + + // The data of the tensor is accessed via the "data" pointer. For example: + + const int nx = 2; + const int ny = 3; + + struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); + + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y; + } + } + + // + // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
+ // + + } + ``` \ No newline at end of file diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b97b7b7937f02..bdcf6f998c2d9 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,7 +117,74 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } +#include "ggml-metal.h" + +bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) { + return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size)); +} + + +void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) { + struct ggml_tensor * first = ggml_get_first_tensor(ctx); + for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->data != NULL) { + if (!is_pointer_in_buffer_range(t->data, buffer, buffer_size)) { + fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name); + } else { + printf("Tensor %s is correctly allocated in the buffer.\n", t->name); + } + } + } +} + int main(int argc, char ** argv) { + + + // The library allows the user to define a certain function using the available tensor operations. This function + // definition is represented internally via a computation graph. Each tensor operation in the function definition + // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the + // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized + // using one of the available optimization algorithms. + // + // For example, here we define the function: f(x) = a*x^2 + b + + // memory allocation happens here + // Create context allogating memory + struct ggml_init_params _params = { + .mem_size = 16*1024*1024, + .mem_buffer = NULL, + .no_alloc = true, + }; + struct ggml_context * _ctx = ggml_init(_params); + + struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); + + // ggml_set_param(_ctx, x); // x is an input variable + + // struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); + // struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); + // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x); + // struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b); + + // struct ggml_cgraph * gf = ggml_new_graph(_ctx); + + // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type()); + // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + else { + size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type()); + + // Verify tensor allocations + verify_tensor_allocation(_ctx, buf, buffer_size); + } + ggml_used_mem(_ctx); + // + + + gpt_params params; g_params = ¶ms; diff --git a/llama.cpp b/llama.cpp index 15e83b0c45cb2..744e4f8c3fb64 100644 --- a/llama.cpp +++ b/llama.cpp @@ -307,6 +307,11 @@ static struct lora_data * load_lora(struct lora_info * info) { tensors_offset.push_back(offset); file.seek(nbytes, SEEK_CUR); } + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); + } // read tensor data 
result->data.resize(total_nbytes_pad); size_t data_offset = 0; @@ -3922,7 +3927,7 @@ struct llama_model_loader { std::vector> read_buf; std::vector>> validation_result; - + // Allocate tensors data to buffer for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -3951,7 +3956,7 @@ struct llama_model_loader { return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); })); } - + // TODO LORA allocation of base tensors GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, data); @@ -5392,7 +5397,7 @@ static bool llm_load_tensors( auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; model.layers.resize(n_layer); - + // main players model, ml, ctx_input/output, tn (gets name?) const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: @@ -6666,7 +6671,7 @@ static bool llm_load_tensors( #endif } } -#ifdef GGML_USE_METAL +#ifdef GGML_USE_METAL // LORA Use metal on base tensors else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) { for (uint32_t idx = 0; idx < ml.files.size(); idx++) { const size_t max_size = ggml_get_max_tensor_size(ctx); @@ -16341,12 +16346,92 @@ struct llama_context * llama_new_context_with_model( // Assign data ctx->llora_data = *loras[0]; - // build the map? - ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); - std::vector keys; - for (const auto& pair : ctx->lora_weights_map) { - keys.push_back(pair.first); + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); } + // Looks this worked, need to check if tensors have new buffer (not sure below). + // Also do we need to set the tensors? not clear where data is, looks like it is loaded after the + // tensor creation in context, but loaded where? cuz if data present dfferebt way to set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc + + // TODO looks like I have already a context with load_lora, understand if + // I am using it + // If the contexg it set to right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); + // As I should already have created the tensors in the context, + // Understand where are the weights loaded instead + // Load the weight/data in the context + // Maybe check finetuning approach at managing the lora weights. + + + + // build the map? 
TODO LORA ctx->lora_weights_map layers seem to not have buffer type but it should as the simple example does + ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); + // std::vector keys; + // for (const auto& pair : ctx->lora_weights_map) { + // keys.push_back(pair.first); + + // ggml_tensor * tensorA = pair.second.loraA; + // ggml_tensor * tensorB = pair.second.loraB; + + // ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne); + // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne); + + // } + + // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) { + // const auto * name = ggml_get_name(cur); + // // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA)); + // // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB)); + + // } + + // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + // const auto * weight = get_weight(ggml_get_name(cur)); + // if (weight == nullptr) { + // // this can happen with split experts models + // continue; + // } + + // if (progress_callback) { + // if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + // return false; + // } + // } + + // size_t n_size = ggml_nbytes(cur); + + // if (use_mmap) { + // const auto & mapping = mappings.at(weight->idx); + // ggml_backend_buffer_t buf_mmap = nullptr; + // if (bufs_mmap.count(weight->idx)) { + // buf_mmap = bufs_mmap.at(weight->idx); + // } + // uint8_t * data = (uint8_t *) mapping->addr + weight->offs; + + // if (check_tensors) { + // validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { + // return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); + // })); + // } + // // TODO LORA allocation of base tensors + // GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated + // if (buf_mmap && cur->data == nullptr) { + // ggml_backend_tensor_alloc(buf_mmap, cur, data); + // if (lmlocks) { + // const auto & lmlock = lmlocks->at(weight->idx); + // lmlock->grow_to(weight->offs + n_size); + // } + + // auto & mmap_used = mmaps_used[weight->idx]; + // mmap_used.first = std::min(mmap_used.first, weight->offs); + // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); + // } else { + // ggml_backend_tensor_set(cur, data, 0, n_size); + + + + } /// LORA From 1103bdb57476b65404221f87b37ed2f91ffd4492 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Tue, 2 Jul 2024 21:59:54 +0100 Subject: [PATCH 05/14] Fixed buffer allocation --- examples/main/main.cpp | 46 ------------------------------------------ llama.cpp | 30 ++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 51 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index bdcf6f998c2d9..5e9e4001de1b5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -139,52 +139,6 @@ void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t b int main(int argc, char ** argv) { - - // The library allows the user to define a certain function using the available tensor operations. This function - // definition is represented internally via a computation graph. Each tensor operation in the function definition - // corresponds to a node in the graph. 
Having the computation graph defined, the user can choose to compute the - // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized - // using one of the available optimization algorithms. - // - // For example, here we define the function: f(x) = a*x^2 + b - - // memory allocation happens here - // Create context allogating memory - struct ggml_init_params _params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = true, - }; - struct ggml_context * _ctx = ggml_init(_params); - - struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); - - // ggml_set_param(_ctx, x); // x is an input variable - - // struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); - // struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1); - // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x); - // struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b); - - // struct ggml_cgraph * gf = ggml_new_graph(_ctx); - - // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type()); - // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type()); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend buffer"); - } - else { - size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type()); - - // Verify tensor allocations - verify_tensor_allocation(_ctx, buf, buffer_size); - } - ggml_used_mem(_ctx); - // - - - gpt_params params; g_params = ¶ms; diff --git a/llama.cpp b/llama.cpp index 744e4f8c3fb64..cd4b43e945e8e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -150,6 +150,11 @@ struct lora_data { struct lora_info info; std::vector data; struct ggml_context * ctx; + // the backend to perform the computation (CPU, CUDA, METAL) + ggml_backend_t backend = NULL; + + // the backend buffer to storage the tensors data of a and b + ggml_backend_buffer_t buffer; uint32_t lora_r; uint32_t lora_alpha; @@ -253,9 +258,17 @@ static struct lora_data * load_lora(struct lora_info * info) { struct lora_data * result = new struct lora_data; result->info = *info; result->ctx = NULL; + result->backend = NULL; + result->buffer = NULL; result->lora_r = 1; result->lora_alpha = 1; + fprintf(stderr, "%s: using Metal backend\n", __func__); + result->backend = ggml_backend_metal_init(); + if (!result->backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + struct llama_file_lora file(info->filename.c_str(), "rb"); if (file.fp == NULL) { fprintf(stderr, "warning: Could not open lora adapter '%s'. 
Ignoring this adapter.\n", @@ -307,9 +320,10 @@ static struct lora_data * load_lora(struct lora_info * info) { tensors_offset.push_back(offset); file.seek(nbytes, SEEK_CUR); } + result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend); - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); - if (!buf) { + // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); + if (!result->buffer) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); } // read tensor data @@ -321,9 +335,15 @@ static struct lora_data * load_lora(struct lora_info * info) { size_t nbytes = ggml_nbytes(tensor); size_t nbytes_pad = ggml_nbytes_pad(tensor); file.seek(offset, SEEK_SET); - tensor->data = result->data.data() + data_offset; - file.read_raw(tensor->data, nbytes); - data_offset += nbytes_pad; + + std::vector read_buf; + read_buf.resize(ggml_nbytes(tensor)); + file.read_raw(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + // tensor_tmp->data = result->data.data() + data_offset; + // file.read_raw(tensor_tmp->data, nbytes); + // data_offset += nbytes_pad; + // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor)); } return result; } From 1734f3f0f842791abeaeb391e1324b50be120eb9 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Tue, 2 Jul 2024 22:18:35 +0100 Subject: [PATCH 06/14] Clean up --- examples/main/main.cpp | 19 ------- llama.cpp | 115 +++++++++-------------------------------- 2 files changed, 25 insertions(+), 109 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 5e9e4001de1b5..ba76a496b5999 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,25 +117,6 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } -#include "ggml-metal.h" - -bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) { - return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size)); -} - - -void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) { - struct ggml_tensor * first = ggml_get_first_tensor(ctx); - for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->data != NULL) { - if (!is_pointer_in_buffer_range(t->data, buffer, buffer_size)) { - fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name); - } else { - printf("Tensor %s is correctly allocated in the buffer.\n", t->name); - } - } - } -} int main(int argc, char ** argv) { diff --git a/llama.cpp b/llama.cpp index cd4b43e945e8e..fe842e4d82b58 100644 --- a/llama.cpp +++ b/llama.cpp @@ -263,11 +263,27 @@ static struct lora_data * load_lora(struct lora_info * info) { result->lora_r = 1; result->lora_alpha = 1; +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + result->backend = ggml_backend_cuda_init(0); // init device 0 + if (!result->backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + +#ifdef GGML_USE_METAL fprintf(stderr, "%s: using Metal backend\n", __func__); result->backend = ggml_backend_metal_init(); if (!result->backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); } +#endif + + // if there aren't GPU Backends fallback to CPU backend + if 
(!result->backend) { + result->backend = ggml_backend_cpu_init(); + } + struct llama_file_lora file(info->filename.c_str(), "rb"); if (file.fp == NULL) { @@ -320,30 +336,24 @@ static struct lora_data * load_lora(struct lora_info * info) { tensors_offset.push_back(offset); file.seek(nbytes, SEEK_CUR); } - result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend); - // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type()); + + + result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend); if (!result->buffer) { LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); } // read tensor data result->data.resize(total_nbytes_pad); - size_t data_offset = 0; for (size_t i = 0; i < tensors.size(); ++i) { struct ggml_tensor * tensor = tensors[i]; size_t offset = tensors_offset[i]; size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); file.seek(offset, SEEK_SET); - std::vector read_buf; - read_buf.resize(ggml_nbytes(tensor)); - file.read_raw(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); - // tensor_tmp->data = result->data.data() + data_offset; - // file.read_raw(tensor_tmp->data, nbytes); - // data_offset += nbytes_pad; - // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor)); + read_buf.resize(nbytes); + file.read_raw(read_buf.data(), nbytes); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes); } return result; } @@ -16344,7 +16354,7 @@ struct llama_context * llama_new_context_with_model( llama_context * ctx = new llama_context(*model); - /// LORA + /// LORA load start struct export_lora_params * lora_params = new struct export_lora_params; struct lora_info lora; lora.filename = params.hot_lora; @@ -16365,27 +16375,6 @@ struct llama_context * llama_new_context_with_model( } // Assign data ctx->llora_data = *loras[0]; - - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); - if (!buf) { - LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__); - } - // Looks this worked, need to check if tensors have new buffer (not sure below). - // Also do we need to set the tensors? not clear where data is, looks like it is loaded after the - // tensor creation in context, but loaded where? cuz if data present dfferebt way to set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc - - // TODO looks like I have already a context with load_lora, understand if - // I am using it - // If the contexg it set to right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()); - // As I should already have created the tensors in the context, - // Understand where are the weights loaded instead - // Load the weight/data in the context - // Maybe check finetuning approach at managing the lora weights. - - - - // build the map? 
TODO LORA ctx->lora_weights_map layers seem to not have buffer type but it should as the simple example does ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); // std::vector keys; // for (const auto& pair : ctx->lora_weights_map) { @@ -16398,63 +16387,9 @@ struct llama_context * llama_new_context_with_model( // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne); // } - - // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) { - // const auto * name = ggml_get_name(cur); - // // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA)); - // // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB)); + } - // } - - // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { - // const auto * weight = get_weight(ggml_get_name(cur)); - // if (weight == nullptr) { - // // this can happen with split experts models - // continue; - // } - - // if (progress_callback) { - // if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { - // return false; - // } - // } - - // size_t n_size = ggml_nbytes(cur); - - // if (use_mmap) { - // const auto & mapping = mappings.at(weight->idx); - // ggml_backend_buffer_t buf_mmap = nullptr; - // if (bufs_mmap.count(weight->idx)) { - // buf_mmap = bufs_mmap.at(weight->idx); - // } - // uint8_t * data = (uint8_t *) mapping->addr + weight->offs; - - // if (check_tensors) { - // validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { - // return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); - // })); - // } - // // TODO LORA allocation of base tensors - // GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated - // if (buf_mmap && cur->data == nullptr) { - // ggml_backend_tensor_alloc(buf_mmap, cur, data); - // if (lmlocks) { - // const auto & lmlock = lmlocks->at(weight->idx); - // lmlock->grow_to(weight->offs + n_size); - // } - - // auto & mmap_used = mmaps_used[weight->idx]; - // mmap_used.first = std::min(mmap_used.first, weight->offs); - // mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); - // } else { - // ggml_backend_tensor_set(cur, data, 0, n_size); - - - - - } - - /// LORA + /// LORA load end const auto & hparams = model->hparams; auto & cparams = ctx->cparams; From 284e665a4bf209fa583f805aae9d12c9e14979df Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Tue, 2 Jul 2024 22:29:49 +0100 Subject: [PATCH 07/14] Clean up --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index fe842e4d82b58..eeca784b9e777 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3986,7 +3986,7 @@ struct llama_model_loader { return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); })); } - // TODO LORA allocation of base tensors + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, data); @@ -5427,7 +5427,7 @@ static bool llm_load_tensors( auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; model.layers.resize(n_layer); - // main players model, ml, ctx_input/output, tn (gets name?) 
+ const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: @@ -6701,7 +6701,7 @@ static bool llm_load_tensors( #endif } } -#ifdef GGML_USE_METAL // LORA Use metal on base tensors +#ifdef GGML_USE_METAL else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) { for (uint32_t idx = 0; idx < ml.files.size(); idx++) { const size_t max_size = ggml_get_max_tensor_size(ctx); From 8f0272c9d716a8938b214c6ac6d68533ab8066af Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sat, 6 Jul 2024 21:19:52 +0100 Subject: [PATCH 08/14] update branch notes --- BRANCH_SETUP.md => _BRANCH_SETUP.md | 55 +++++++++++++++++++++++++---- llama.cpp | 7 ++-- 2 files changed, 54 insertions(+), 8 deletions(-) rename BRANCH_SETUP.md => _BRANCH_SETUP.md (89%) diff --git a/BRANCH_SETUP.md b/_BRANCH_SETUP.md similarity index 89% rename from BRANCH_SETUP.md rename to _BRANCH_SETUP.md index 0b6cdac74a45e..b2d5ab6af6d59 100644 --- a/BRANCH_SETUP.md +++ b/_BRANCH_SETUP.md @@ -32,13 +32,14 @@ Run main with base model and lora adapter to hot-swap ```bash ./main -m ./models/open-llama/ggml-model-f16.gguf \ --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \ --ngl 0 \ +-ngl 99 \ +-n 128 +``` +```bash +./main -m ./models/open-llama/ggml-model-f16.gguf \ +-ngl 99 \ -n 128 ``` - -Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil` - -With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors. # Logic @@ -299,4 +300,46 @@ int main() { // } - ``` \ No newline at end of file + ``` + + + + ```bash + # Convert base model to gguf + python3 convert-hf-to-gguf.py models/open-llama/ && \ + # Quantize base model + ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \ + # Obtain Lora adapter + ./finetune --model-base models/open-llama/ggml-model-q4.gguf \ + --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \ + --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \ + --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \ + --train-data "data/hot-lora.txt" \ + --save-every 1 \ + --threads 1 \ + --adam-iter 1 \ + --batch 1 \ + --ctx 16 \ + --use-checkpointing + ``` + + + +## 1. 
Run main with adapter + +- Run main with base model and lora adapter to hot-swap + ```bash + ./main -m ./models/open-llama/ggml-model-q4.gguf \ + --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \ + -ngl 99 \ + -n 128 + ``` + +- Do not pass the flag `--hot-lora` and the adapter is ignored: + ```bash + ./main -m ./models/open-llama/ggml-model-q4.gguf \ + -ngl 99 \ + -n 128 + ``` + + make clean && make -j 8 LLAMA_DEBUG=1 \ No newline at end of file diff --git a/llama.cpp b/llama.cpp index eeca784b9e777..df098b652ba6b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9731,8 +9731,11 @@ struct llm_build_context { ggml_tensor * loraB = it->second.loraB; ggml_tensor * t_lora = ggml_mul_mat(ctx0, - ggml_mul_mat(ctx0, loraA, loraB), - cur + loraA, + ggml_mul_mat(ctx0, + ggml_transpose(ctx0, loraB), + cur + ) ); if (lctx.lora_scale != 1.0f) { From 798cde72a187e35f8a72ce68bddd8c518622ce76 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sat, 6 Jul 2024 21:40:22 +0100 Subject: [PATCH 09/14] transpose and run cont --- _BRANCH_SETUP.md | 5 ++++- llama.cpp | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/_BRANCH_SETUP.md b/_BRANCH_SETUP.md index b2d5ab6af6d59..7cdb8ae6a67e2 100644 --- a/_BRANCH_SETUP.md +++ b/_BRANCH_SETUP.md @@ -342,4 +342,7 @@ int main() { -n 128 ``` - make clean && make -j 8 LLAMA_DEBUG=1 \ No newline at end of file +build for debug: +```bash + make clean && make -j 8 LLAMA_DEBUG=1 +``` \ No newline at end of file diff --git a/llama.cpp b/llama.cpp index df098b652ba6b..1f3e127fbe408 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9731,9 +9731,9 @@ struct llm_build_context { ggml_tensor * loraB = it->second.loraB; ggml_tensor * t_lora = ggml_mul_mat(ctx0, - loraA, + loraB, ggml_mul_mat(ctx0, - ggml_transpose(ctx0, loraB), + ggml_cont(ctx0, ggml_transpose(ctx0, loraA)), cur ) ); From 931134b536d6e79c463d10b5da6f39d9d0891214 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sat, 6 Jul 2024 22:59:15 +0100 Subject: [PATCH 10/14] transpose when loading --- llama.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1f3e127fbe408..31baf6c6cccca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -328,6 +328,12 @@ static struct lora_data * load_lora(struct lora_info * info) { file.seek((0-file.tell()) & 31, SEEK_CUR); size_t offset = file.tell(); struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); + // Transpose lora matrix A + if (std::string(name_buf.data()).find("loraA") != std::string::npos) { + tensor = ggml_cont(result->ctx, + ggml_transpose(result->ctx, tensor) + ); + } ggml_set_name(tensor, name_buf.data()); size_t nbytes = ggml_nbytes(tensor); size_t nbytes_pad = ggml_nbytes_pad(tensor); @@ -9732,10 +9738,7 @@ struct llm_build_context { ggml_tensor * t_lora = ggml_mul_mat(ctx0, loraB, - ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, loraA)), - cur - ) + ggml_mul_mat(ctx0, loraA, cur) ); if (lctx.lora_scale != 1.0f) { From 41e8c733f6fd3c1d1750b50ef28d66090394d934 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sun, 7 Jul 2024 10:32:53 +0100 Subject: [PATCH 11/14] Transpose after setting data --- llama.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 31baf6c6cccca..adfbb4828ccb3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -328,12 +328,6 @@ static struct lora_data * load_lora(struct lora_info * info) { file.seek((0-file.tell()) & 31, SEEK_CUR); size_t offset = file.tell(); struct 
ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); - // Transpose lora matrix A - if (std::string(name_buf.data()).find("loraA") != std::string::npos) { - tensor = ggml_cont(result->ctx, - ggml_transpose(result->ctx, tensor) - ); - } ggml_set_name(tensor, name_buf.data()); size_t nbytes = ggml_nbytes(tensor); size_t nbytes_pad = ggml_nbytes_pad(tensor); @@ -360,6 +354,14 @@ static struct lora_data * load_lora(struct lora_info * info) { read_buf.resize(nbytes); file.read_raw(read_buf.data(), nbytes); ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes); + // Transpose lora matrix A + std::string original_name(tensor->name); + if (std::string(tensor->name).find(".loraA") != std::string::npos) { + tensor = ggml_cont(result->ctx, + ggml_transpose(result->ctx, tensor) + ); + ggml_set_name(tensor, original_name.c_str()); + } } return result; } From 6597a72c1d09544861803ebb5d1fd6265066fe0d Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Sun, 7 Jul 2024 22:09:30 +0100 Subject: [PATCH 12/14] Remove files --- _BRANCH_SETUP.md | 348 ----------------------------------------- data/hot-lora.txt | 2 +- examples/main/main.cpp | 2 - llama.cpp | 21 +-- 4 files changed, 5 insertions(+), 368 deletions(-) delete mode 100644 _BRANCH_SETUP.md diff --git a/_BRANCH_SETUP.md b/_BRANCH_SETUP.md deleted file mode 100644 index 7cdb8ae6a67e2..0000000000000 --- a/_BRANCH_SETUP.md +++ /dev/null @@ -1,348 +0,0 @@ -# Setup this branch - -## Create a lora adpter bin file - -0. `mkdir models/open-llama` and download [Open-llama (all files)](https://huggingface.co/openlm-research/open_llama_3b_v2/tree/main) in the folder `./models/open-llama` - -2. `mkdir data && touch data/hot-lora.txt` and write a couple of words in it. - -3. Run: - ```bash - # Convert base model to gguf - python3 convert-hf-to-gguf.py models/open-llama/ - # Quantize base model - ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q8_0.gguf Q8_0 - # Obtain Lora adapter - ./finetune --model-base models/open-llama/ggml-model-q8_0.gguf \ - --checkpoint-in models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-LATEST.gguf \ - --checkpoint-out models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-ITERATION.gguf \ - --lora-out models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \ - --train-data "data/hot-lora.txt" \ - --save-every 1 \ - --threads 1 \ - --adam-iter 1 \ - --batch 1 \ - --ctx 16 \ - --use-checkpointing - ``` - -## Run main with adapter - -Run main with base model and lora adapter to hot-swap -```bash -./main -m ./models/open-llama/ggml-model-f16.gguf \ ---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \ --ngl 99 \ --n 128 -``` -```bash -./main -m ./models/open-llama/ggml-model-f16.gguf \ --ngl 99 \ --n 128 -``` - -# Logic - - - - -# Current status - -- Only one Lora adapter can be passed. 
-- Applying only adapter to Q, K, V matrices to keep the code contained (fintuning trained lora tensors for all linear layers) -- GPU not supported - - - - -# Tutorial - -```cpp -#include "llama.h" - -#include "unicode.h" - -#include "ggml.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" - -#ifdef GGML_USE_RPC -# include "ggml-rpc.h" -#endif - -#ifdef GGML_USE_CUDA -# include "ggml-cuda.h" -#elif defined(GGML_USE_VULKAN) -# include "ggml-vulkan.h" -#elif defined(GGML_USE_SYCL) -# include "ggml-sycl.h" -#elif defined(GGML_USE_KOMPUTE) -# include "ggml-kompute.h" -#endif - -#ifdef GGML_USE_METAL -# include "ggml-metal.h" -#endif - -// TODO: replace with ggml API call -#define QK_K 256 - -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #include - #endif - #if defined(_POSIX_MEMLOCK_RANGE) - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #ifndef PATH_MAX - #define PATH_MAX MAX_PATH - #endif - #include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ggml-metal.h" - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_ATTRIBUTE_FORMAT(...) -#endif - -#define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_EXPERTS 160 - - -int main() { - struct ggml_init_params params = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - /*.no_alloc =*/ true, - }; - - // The library allows the user to define a certain function using the available tensor operations. This function - // definition is represented internally via a computation graph. Each tensor operation in the function definition - // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the - // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized - // using one of the available optimization algorithms. 
- // - // For example, here we define the function: f(x) = a*x^2 + b - - // memory allocation happens here - // Create context allogating memory - struct ggml_context * ctx = ggml_init(params); - - struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - - ggml_set_param(ctx, x); // x is an input variable - - struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - struct ggml_tensor * x2 = ggml_mul(ctx, x, x); - struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - - // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); - // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); - if (buf == nullptr) { - throw std::runtime_error("unable to allocate backend buffer"); - } - ggml_used_mem(ctx); - - // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp - // How to check which buffer is the context allocated, - // can look at single tensors? option, check in inited in base model - - // Try this - // You can simplify all of this for testing, and if you are using CPU only, and just run with -ngl 0 - // and allocate everything in a CPU buffer by using - // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); - // or run with -ngl 99 and use a Metal buffer type instead with - // ggml_backend_metal_buffer_type() - // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend - // to allocate the tensors, it will just be slower. - - // Notice that the function definition above does not involve any actual computation. The computation is performed only - // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: - - - ggml_build_forward_expand(gf, f); - - // set the input variable and parameter values - ggml_set_f32(x, 2.0f); - ggml_set_f32(a, 3.0f); - ggml_set_f32(b, 4.0f); - - ggml_graph_compute_with_ctx(ctx, gf, 1); - - printf("f = %f\n", ggml_get_f32_1d(f, 0)); - - // The actual computation is performed in the ggml_graph_compute() function. - // - // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the - // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know - // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory - // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was - // actually needed. - // - // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic - // differentiation and optimization algorithms. - // - // The described approach allows to define the function graph once and then compute its forward or backward graphs - // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way - // the user can avoid the memory allocation overhead at runtime. - // - // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class - // citizens, but in theory the library can be extended to support FP8 and integer data types. - // - // Each tensor operation produces a new tensor. 
Initially the library was envisioned to support only the use of unary - // and binary operations. Most of the available operations fall into one of these two categories. With time, it became - // clear that the library needs to support more complex operations. The way to support these operations is not clear - // yet, but a few examples are demonstrated in the following operations: - // - // - ggml_permute() - // - ggml_conv_1d_1s() - // - ggml_conv_1d_2s() - // - // For each tensor operator, the library implements a forward and backward computation function. The forward function - // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the - // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a - // calculus class, or watch the following video: - // - // What is Automatic Differentiation? - // https://www.youtube.com/watch?v=wG_nF1awSSY - - // ## Tensor data (struct ggml_tensor) - // - // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of - // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains - // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: - - struct ggml_tensor * c = ggml_add(ctx, a, b); - - assert(c->src[0] == a); - assert(c->src[1] == b); - - // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the - // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows - // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and - // permutation. All tensor operations have to take the stride into account and not assume that the tensor is - // contiguous in memory. - - // The data of the tensor is accessed via the "data" pointer. For example: - - const int nx = 2; - const int ny = 3; - - struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); - - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y; - } - } - - // - // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. - // - - } - ``` - - - - ```bash - # Convert base model to gguf - python3 convert-hf-to-gguf.py models/open-llama/ && \ - # Quantize base model - ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \ - # Obtain Lora adapter - ./finetune --model-base models/open-llama/ggml-model-q4.gguf \ - --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \ - --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \ - --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \ - --train-data "data/hot-lora.txt" \ - --save-every 1 \ - --threads 1 \ - --adam-iter 1 \ - --batch 1 \ - --ctx 16 \ - --use-checkpointing - ``` - - - -## 1. 
Run main with adapter - -- Run main with base model and lora adapter to hot-swap - ```bash - ./main -m ./models/open-llama/ggml-model-q4.gguf \ - --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \ - -ngl 99 \ - -n 128 - ``` - -- Do not pass the flag `--hot-lora` and the adapter is ignored: - ```bash - ./main -m ./models/open-llama/ggml-model-q4.gguf \ - -ngl 99 \ - -n 128 - ``` - -build for debug: -```bash - make clean && make -j 8 LLAMA_DEBUG=1 -``` \ No newline at end of file diff --git a/data/hot-lora.txt b/data/hot-lora.txt index c43186710e906..e88891d2f5eaf 100644 --- a/data/hot-lora.txt +++ b/data/hot-lora.txt @@ -1,2 +1,2 @@ - how are you? +test data to train adapter \ No newline at end of file diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ba76a496b5999..b97b7b7937f02 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,9 +117,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } - int main(int argc, char ** argv) { - gpt_params params; g_params = ¶ms; diff --git a/llama.cpp b/llama.cpp index adfbb4828ccb3..8a5a71c77d84a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2544,7 +2544,6 @@ struct llama_context { } llama_cparams cparams; - bool lora_loaded = false; std::map lora_weights_map; // only one LoRA adapter at the moment lora_data llora_data; float lora_scale = 1.0f; @@ -16309,7 +16308,7 @@ void llama_free_model(struct llama_model * model) { } -static std::map get_lora_weights_map_cpp(struct ggml_context* ctx) { +static std::map get_lora_weights_map(struct ggml_context* ctx) { struct lora_tensor_pair* pair = build_lora_weights_map(ctx); std::map map; @@ -16370,7 +16369,7 @@ struct llama_context * llama_new_context_with_model( lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras? 
lora_params->lora.push_back(lora); - // load all loras + // load all loras (only 1 supported here) std::vector loras; for (size_t i = 0; i < lora_params->lora.size(); ++i) { struct lora_data * llora_data = load_lora(&lora_params->lora[i]); @@ -16381,22 +16380,10 @@ struct llama_context * llama_new_context_with_model( if (loras.size() == 0) { fprintf(stderr, "warning: no lora adapters will be applied.\n"); } - // Assign data + // Assign data and get mapping (index 0 as only 1 lora is supoprted now) ctx->llora_data = *loras[0]; - ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx); - // std::vector keys; - // for (const auto& pair : ctx->lora_weights_map) { - // keys.push_back(pair.first); - - // ggml_tensor * tensorA = pair.second.loraA; - // ggml_tensor * tensorB = pair.second.loraB; - - // ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne); - // ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne); - - // } + ctx->lora_weights_map = get_lora_weights_map((ctx->llora_data).ctx); } - /// LORA load end const auto & hparams = model->hparams; From e481eb55599ea4cfb452b9d6589b5bfbeb2574bc Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Mon, 8 Jul 2024 08:41:03 +0100 Subject: [PATCH 13/14] renames --- llama.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llama.cpp b/llama.cpp index 8a5a71c77d84a..ba6650ccac75e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2545,7 +2545,7 @@ struct llama_context { llama_cparams cparams; std::map lora_weights_map; // only one LoRA adapter at the moment - lora_data llora_data; + lora_data llama_lora_data; float lora_scale = 1.0f; std::vector backends; @@ -7699,21 +7699,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = lora_mul_mat(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = lora_mul_mat(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = lora_mul_mat(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9722,7 +9722,7 @@ struct llm_build_context { return gf; } - static ggml_tensor * lora_mul_mat( + static ggml_tensor * ggml_mul_mat_lora( llama_context & lctx, ggml_context * ctx0, ggml_tensor * weight, @@ -16372,17 +16372,17 @@ struct llama_context * llama_new_context_with_model( // load all loras (only 1 supported here) std::vector loras; for (size_t i = 0; i < lora_params->lora.size(); ++i) { - struct lora_data * llora_data = load_lora(&lora_params->lora[i]); - if (llora_data != NULL) { - loras.push_back(llora_data); + struct lora_data * llama_lora_data = load_lora(&lora_params->lora[i]); + if (llama_lora_data != NULL) { + loras.push_back(llama_lora_data); } } if (loras.size() == 0) { fprintf(stderr, "warning: no lora adapters will be applied.\n"); } // Assign data and get mapping (index 0 as only 1 lora is supoprted 
now) - ctx->llora_data = *loras[0]; - ctx->lora_weights_map = get_lora_weights_map((ctx->llora_data).ctx); + ctx->llama_lora_data = *loras[0]; + ctx->lora_weights_map = get_lora_weights_map((ctx->llama_lora_data).ctx); } /// LORA load end From 9d5089b5bfd15d055e16aad1d925052748102f86 Mon Sep 17 00:00:00 2001 From: Lorenzo Toniazzi Date: Mon, 8 Jul 2024 14:36:27 +0100 Subject: [PATCH 14/14] Add ff lora matmuls --- ggml.c | 1 + llama.cpp | 81 +++++++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/ggml.c b/ggml.c index 0fb8dafbd2ab5..a9cb2bc73b48e 100644 --- a/ggml.c +++ b/ggml.c @@ -5331,6 +5331,7 @@ struct ggml_tensor * ggml_group_norm_inplace( return ggml_group_norm_impl(ctx, a, n_groups, true); } + // ggml_mul_mat struct ggml_tensor * ggml_mul_mat( diff --git a/llama.cpp b/llama.cpp index ba6650ccac75e..986dae59cc07e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -121,6 +121,7 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, ///////// LORA + struct lora_weights { ggml_tensor* loraA; ggml_tensor* loraB; @@ -2622,6 +2623,37 @@ struct llama_context { struct llama_control_vector cvec; }; + + +static ggml_tensor * ggml_mul_mat_lora( + llama_context * lctx, + ggml_context * ctx0, + ggml_tensor * weight, + ggml_tensor * cur) { + ggml_tensor * mm = ggml_mul_mat(ctx0, weight, cur); + + auto it = lctx->lora_weights_map.find(weight->name); + if (it == lctx->lora_weights_map.end()) { + return mm; + } + + ggml_tensor * loraA = it->second.loraA; + ggml_tensor * loraB = it->second.loraB; + + ggml_tensor * t_lora = ggml_mul_mat(ctx0, + loraB, + ggml_mul_mat(ctx0, loraA, cur) + ); + + if (lctx->lora_scale != 1.0f) { + t_lora = ggml_scale(ctx0, t_lora, lctx->lora_scale); + } + + ggml_tensor * t_patch = ggml_add(ctx0, mm, t_lora); + return t_patch; + +} + static size_t llama_get_device_count(const llama_model & model) { size_t count = 1; #if defined(GGML_USE_CUDA) @@ -7022,8 +7054,9 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, const llm_build_cb & cb, - int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + int il, + struct llama_context * lctx = nullptr) { + struct ggml_tensor * tmp = up ? 
ggml_mul_mat_lora(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7035,12 +7068,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = ggml_mul_mat_lora(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = ggml_mul_mat_lora(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7088,7 +7121,7 @@ static struct ggml_tensor * llm_build_ffn( cb(cur, "ffn_gate_par", il); } - cur = ggml_mul_mat(ctx, down, cur); + cur = ggml_mul_mat_lora(lctx, ctx, down, cur); if (down_b) { cb(cur, "ffn_down", il); } @@ -7699,21 +7732,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = ggml_mul_mat_lora(&lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = ggml_mul_mat_lora(&lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat_lora(lctx, ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = ggml_mul_mat_lora(&lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -7762,7 +7795,8 @@ struct llm_build_context { model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, + &lctx); cb(cur, "ffn_out", il); } else { // MoE branch @@ -9722,35 +9756,6 @@ struct llm_build_context { return gf; } - static ggml_tensor * ggml_mul_mat_lora( - llama_context & lctx, - ggml_context * ctx0, - ggml_tensor * weight, - ggml_tensor * cur) { - ggml_tensor * mm = ggml_mul_mat(ctx0, weight, cur); - - auto it = lctx.lora_weights_map.find(weight->name); - if (it == lctx.lora_weights_map.end()) { - return mm; - } - - ggml_tensor * loraA = it->second.loraA; - ggml_tensor * loraB = it->second.loraB; - - ggml_tensor * t_lora = ggml_mul_mat(ctx0, - loraB, - ggml_mul_mat(ctx0, loraA, cur) - ); - - if (lctx.lora_scale != 1.0f) { - t_lora = ggml_scale(ctx0, t_lora, lctx.lora_scale); - } - - ggml_tensor * t_patch = ggml_add(ctx0, mm, t_lora); - return t_patch; - -} - struct ggml_cgraph * build_phi3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
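For reference, the patched projection that `ggml_mul_mat_lora` composes out of ggml ops above is simply `y = W·x + lora_scale · B·(A·x)`: the base matmul plus the low-rank correction, applied here to the attention and FFN weights. The following is a minimal plain-C++ sketch of that arithmetic only, not of the ggml graph; the row-major shape convention (`W` is d_out×d_in, `A` is r×d_in, `B` is d_out×r) and the `matvec` helper are illustrative assumptions and do not reflect ggml's ne/nb tensor layout.

```cpp
// Standalone sketch of y = W x + scale * B (A x), the quantity the
// ggml_mul_mat_lora helper builds with two nested ggml_mul_mat calls.
#include <cstdio>
#include <vector>

// Dense row-major matrix-vector product: M is rows x cols, x has cols entries.
static std::vector<float> matvec(const std::vector<float> & M, const std::vector<float> & x,
                                 int rows, int cols) {
    std::vector<float> y(rows, 0.0f);
    for (int i = 0; i < rows; ++i)
        for (int j = 0; j < cols; ++j)
            y[i] += M[i*cols + j] * x[j];
    return y;
}

int main() {
    const int d_in = 4, d_out = 4, r = 2;   // r << d_in in a real adapter
    const float scale = 1.0f;               // analogue of lctx->lora_scale

    std::vector<float> W(d_out*d_in, 0.1f); // base weight
    std::vector<float> A(r*d_in,     0.01f); // loraA (already transposed at load time)
    std::vector<float> B(d_out*r,    0.02f); // loraB
    std::vector<float> x(d_in,       1.0f);  // activation "cur"

    std::vector<float> base = matvec(W, x, d_out, d_in); // W x            (base matmul)
    std::vector<float> ax   = matvec(A, x, r,     d_in); // A x            (inner ggml_mul_mat)
    std::vector<float> bax  = matvec(B, ax, d_out, r);   // B (A x)        (outer ggml_mul_mat)

    for (int i = 0; i < d_out; ++i) base[i] += scale * bax[i]; // ggml_scale + ggml_add

    for (int i = 0; i < d_out; ++i) printf("y[%d] = %f\n", i, base[i]);
    return 0;
}
```

This is also why the earlier patches move the transpose of `loraA` around: the inner product has to see `A` laid out so that `A·x` has rank-r output, whether that is achieved with an explicit `ggml_transpose`/`ggml_cont` in the graph or by transposing the tensor once when the adapter file is loaded.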