diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b82da9..79ea3c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,21 +1,22 @@ cmake_minimum_required(VERSION 3.12) -project(llava.cpp) +project(lmm.cpp) # Add "llama.cpp" submodule -add_subdirectory(llama.cpp) +add_subdirectory(llama.cpp llama.cpp/common) # Add "clip.cpp" library add_library(clip clip.cpp clip.h) target_link_libraries(clip PRIVATE ggml) -# Add target executable/library for llava.cpp -add_executable(llava main.cpp) +# Add target executable/library for main +add_executable(main main.cpp) # Link against clip and llama libs -target_link_libraries(llava clip llama) +target_link_libraries(main clip llama common) -# Include directories for llava -target_include_directories(llava PRIVATE +# Include directories for lmm +target_include_directories(main PRIVATE . ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common ) diff --git a/clip.cpp b/clip.cpp index ddd5d64..4a13ad8 100644 --- a/clip.cpp +++ b/clip.cpp @@ -4,68 +4,623 @@ #include #include #include +#include #include #include +#include +#include +#include #include "clip.h" -#include "llama.cpp/ggml.h" +#include "ggml.h" #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" // #define CLIP_DEBUG +static std::string format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +// +// key constants +// + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" + +// +// tensor name constants +// + +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ 
"visual_projection.weight" +#define TN_LLAVA_PROJ "llava_projector.%s" + +// +// utilities to get data from a gguf file +// + +int get_key_idx(const gguf_context * ctx, const char * key) { + int i = gguf_find_key(ctx, key); + if (i == -1) { + fprintf(stderr, "key %s not found in file\n", key); + throw std::runtime_error(format("Missing required key: %s", key)); + } + + return i; +} + +const uint32_t get_u32(const gguf_context * ctx, std::string key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_u32(ctx, i); +} + +const float get_f32(const gguf_context * ctx, std::string key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_f32(ctx, i); +} + +struct ggml_tensor * get_tensor(struct ggml_context * ctx, std::string name) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if (!cur) { + printf("unable to find tensor %s\n", name.c_str()); + throw std::runtime_error(format("unable to find tensor %s\n", name.c_str())); + } + + return cur; +} + +std::string get_ftype(int ftype) { + switch (ftype) { + case 0: + return "f32"; + break; + case 1: + return "f16"; + break; + case 2: + return "q4_0"; + break; + case 3: + return "q4_1"; + break; + case 6: + return "q5_0"; + break; + case 7: + return "q5_1"; + break; + case 8: + return "q8_0"; + break; + default: + throw std::runtime_error(format("Unrecognized file type: %d\n", ftype)); + } +} + +// +// Vocab utils +// + +struct clip_vocab { + using id = clip_vocab_id; + using token = std::string; + + std::map token_to_id; + std::map id_to_token; + std::vector special_tokens; + + // void add_special_token(const std::string & token); +}; + +// +// clip layers +// + +struct clip_layer { + // attention + struct ggml_tensor * k_w; + struct ggml_tensor * k_b; + struct ggml_tensor * q_w; + struct ggml_tensor * q_b; + struct ggml_tensor * v_w; + struct ggml_tensor * v_b; + + struct ggml_tensor * o_w; + struct ggml_tensor * o_b; + + // layernorm 1 + struct ggml_tensor * 
ln_1_w; + struct ggml_tensor * ln_1_b; + + // ff + struct ggml_tensor * ff_i_w; + struct ggml_tensor * ff_i_b; + + struct ggml_tensor * ff_o_w; + struct ggml_tensor * ff_o_b; + + // layernorm 2 + struct ggml_tensor * ln_2_w; + struct ggml_tensor * ln_2_b; +}; + +struct clip_text_model { + struct clip_text_hparams hparams; + + // embeddings + struct ggml_tensor * token_embeddings; + struct ggml_tensor * position_embeddings; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; +}; + +struct clip_vision_model { + struct clip_vision_hparams hparams; + + // embeddings + struct ggml_tensor * class_embedding; + struct ggml_tensor * patch_embeddings; + struct ggml_tensor * position_embeddings; + + struct ggml_tensor * pre_ln_w; + struct ggml_tensor * pre_ln_b; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; + + // LLaVA projection + struct ggml_tensor * llava_proj_w; + struct ggml_tensor * llava_proj_b; +}; + +// Replacement for std::vector that doesn't require zero-initialization. 
+struct clip_buffer { + uint8_t * data = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] data; + data = new uint8_t[size]; + this->size = size; + } + + ~clip_buffer() { delete[] data; } +}; + +struct clip_ctx { + bool has_text_encoder = false; + bool has_vision_encoder = false; + bool has_llava_projector = false; + struct clip_text_model text_model; + struct clip_vision_model vision_model; + struct clip_vocab vocab; + float image_mean[3]; + float image_std[3]; + bool use_gelu = false; + int32_t ftype = 1; + struct ggml_context * ctx; + struct gguf_context * ctx_gguf; + struct clip_buffer buf_compute; +}; + +// +// memory allocation and management +// + // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly -size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) { +size_t get_mem_req_by_size(struct clip_ctx * ctx) { size_t mb = 1024 * 1024; + const int n_tensors = gguf_get_n_tensors(ctx->ctx_gguf); + const auto & vision_hparams = clip_get_vision_hparams(ctx); + const int n_positions = + ctx->has_vision_encoder ? 
vision_hparams->image_size * vision_hparams->image_size / vision_hparams->patch_size + 1 : 77; switch (n_tensors) { - case 397: // base - if (n_image_positions == 50) { // patch size = 32 + case 397: // base, two-tower + case 200: // base, vision-only + if (n_positions == 50) { // patch size = 32 return 12 * mb; } else { // patch size = 16 return 24 * mb; } - case 589: // large - if (n_image_positions == 257) { // input image size = 224 + case 197: // base or large, text-only + return 12 * mb; + case 589: // large, two-tower + case 392: // large, vision-only + case 377: // large, LLaVA encoder + if (n_positions == 257) { // input image size = 224 return 24 * mb; } else { // input image size = 336 return 60 * mb; } - case 909: // huge + case 909: // huge, two-tower + case 520: // huge, vision-only return 232 * mb; + case 389: // huge, text-only + return 120 * mb; default: - fprintf(stderr, "%s: Unrecognized number of tensors: %zu. Check if you pass the correct model file\n", __func__, + fprintf(stderr, "%s: Unrecognized number of tensors: %d. Check if you pass the correct model file\n", __func__, n_tensors); exit(1); } } -size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) { +size_t get_scr_buf_req_by_size(struct clip_ctx * ctx) { size_t mb = 1024 * 1024; + + const int n_tensors = gguf_get_n_tensors(ctx->ctx_gguf); + const auto & vision_hparams = clip_get_vision_hparams(ctx); + const int n_positions = + ctx->has_vision_encoder ? vision_hparams->image_size * vision_hparams->image_size / vision_hparams->patch_size + 1 : 77; + switch (n_tensors) { case 397: + case 200: if (n_positions <= 50) { return 32 * mb; } else { return 96 * mb; } + case 197: + return 32 * mb; case 589: + case 392: + case 377: if (n_positions <= 257) { return 96 * mb; } else { return 192 * mb; } case 909: + case 520: return 144 * mb; + case 389: + return 60 * mb; default: - fprintf(stderr, "%s: Unrecognized number of tensors: %zu. 
Check if you pass the correct model file\n", __func__, + fprintf(stderr, "%s: Unrecognized number of tensors: %d. Check if you pass the correct model file\n", __func__, n_tensors); exit(1); } } -std::vector clip_tokenize(const clip_ctx * ctx, const std::string & text) { +// read and create ggml_context containing the tensors and their data +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + + struct ggml_context * meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname, params); + + if (verbosity >= 1) { + const int n_tensors = gguf_get_n_tensors(ctx); + const int n_kv = gguf_get_n_kv(ctx); + const int ftype = get_u32(ctx, KEY_FTYPE); + const std::string ftype_str = get_ftype(ftype); + const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION); + const std::string description = gguf_get_val_str(ctx, idx_desc); + const int idx_name = gguf_find_key(ctx, KEY_NAME); + if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug + const std::string name = gguf_get_val_str(ctx, idx_name); + printf("%s: model name: %s\n", __func__, name.c_str()); + } + printf("%s: description: %s\n", __func__, description.c_str()); + printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + printf("%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_kv: %d\n", __func__, n_kv); + printf("%s: ftype: %s\n", __func__, ftype_str.c_str()); + printf("\n"); + } + + // kv + if (verbosity >= 3) { + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ctx, i); + + printf("%s: kv[%d]: key = %s\n", __func__, i, key); + } + printf("\n"); + } + + // data + size_t ctx_size = 0; + { + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + const char 
* name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + + struct ggml_tensor * cur = ggml_get_tensor(meta, name); + ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + size_t tensor_size = ggml_nbytes(cur); + size_t padded_size = ggml_nbytes_pad(cur); + ctx_size += padded_size; + if (verbosity >= 3) { + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, + cur->n_dims, cur->name, tensor_size, padded_size, offset); + } + } + } + + clip_ctx * new_clip = new clip_ctx; + + // model size and capabilities + { + int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); + new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); + + idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); + new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); + + idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); + if (idx != -1) { + new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); + } + + idx = get_key_idx(ctx, KEY_USE_GELU); + new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + + if (verbosity >= 1) { + printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0)); + printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + } + } + + // load tensors + { + struct ggml_init_params params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + .no_alloc = false, + }; + + new_clip->ctx = ggml_init(params); + if (!new_clip->ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + clip_free(new_clip); + return nullptr; + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + printf("cannot open model file for loading tensors\n"); + clip_free(new_clip); + return nullptr; + } + + const int n_tensors = 
gguf_get_n_tensors(ctx); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * t = ggml_get_tensor(meta, name); + struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t); + ggml_set_name(cur, name); + + const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); + fin.seekg(offset, std::ios::beg); + if (!fin) { + printf("%s: failed to seek for tensor %s\n", __func__, name); + clip_free(new_clip); + return nullptr; + } + + fin.read(reinterpret_cast(cur->data), ggml_nbytes(t)); + } + + fin.close(); + } + + // text model + if (new_clip->has_text_encoder) { + // load text model + auto & text_model = new_clip->text_model; + auto & hparams = text_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "text")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "text")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "text")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "text")); + hparams.num_positions = get_u32(ctx, KEY_N_POSITIONS); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "text")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "text")); + + const int idx_tokens = get_key_idx(ctx, KEY_TOKENS); + hparams.n_vocab = gguf_get_arr_n(ctx, idx_tokens); + auto & vocab = new_clip->vocab; + for (int id = 0; id < hparams.n_vocab; ++id) { + const std::string token = gguf_get_arr_str(ctx, idx_tokens, id); + vocab.id_to_token[id] = token; + vocab.token_to_id[token] = id; + } + + if (verbosity >= 2) { + printf("\n%s: text model hparams\n", __func__); + printf("n_vocab %d\n", hparams.n_vocab); + printf("num_positions %d\n", hparams.num_positions); + printf("t_hidden_size %d\n", hparams.hidden_size); + printf("t_n_intermediate %d\n", hparams.n_intermediate); + printf("t_projection_dim %d\n", hparams.projection_dim); + printf("t_n_head %d\n", hparams.n_head); + printf("t_n_layer %d\n", hparams.n_layer); + } + + 
text_model.token_embeddings = get_tensor(new_clip->ctx, format(TN_TOKEN_EMBD, "t")); + text_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "t")); + text_model.post_ln_w = get_tensor(new_clip->ctx, format(TN_LN_POST, "t", "weight")); + text_model.post_ln_b = get_tensor(new_clip->ctx, format(TN_LN_POST, "t", "bias")); + text_model.projection = get_tensor(new_clip->ctx, TN_TEXT_PROJ); + text_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = text_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "t", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "t", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "t", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "t", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "t", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "t", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "t", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "t", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "t", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "t", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "t", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "t", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "t", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "t", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "t", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "t", il, "bias")); + } + } + + // vision model + if (new_clip->has_vision_encoder) { + // load vision model + auto & vision_model = new_clip->vision_model; + auto & hparams = 
vision_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); + hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); + hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + + int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); + int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + for (int i = 0; i < 3; ++i) { + new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean)); + new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std)); + } + + if (verbosity >= 2) { + printf("\n%s: vision model hparams\n", __func__); + printf("image_size %d\n", hparams.image_size); + printf("patch_size %d\n", hparams.patch_size); + printf("v_hidden_size %d\n", hparams.hidden_size); + printf("v_n_intermediate %d\n", hparams.n_intermediate); + printf("v_projection_dim %d\n", hparams.projection_dim); + printf("v_n_head %d\n", hparams.n_head); + printf("v_n_layer %d\n", hparams.n_layer); + } + + vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD); + vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v")); + vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias")); + vision_model.post_ln_w = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "bias")); + if (new_clip->has_llava_projector) { + vision_model.llava_proj_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, "weight")); + 
vision_model.llava_proj_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, "bias")); + } else { + vision_model.projection = get_tensor(new_clip->ctx, TN_VIS_PROJ); + } + vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = vision_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias")); + } + } + + ggml_free(meta); + + new_clip->ctx_gguf = ctx; + + const size_t mem_req = get_mem_req_by_size(new_clip); + new_clip->buf_compute.resize(mem_req); + if (verbosity >= 1) { + printf("\n%s: %zu MB of memory allocated\n", __func__, mem_req / 1024 / 1024); + } + + return new_clip; +} + +bool clip_tokenize(const clip_ctx * ctx, const char * text, struct clip_tokens * tokens) { + if (!ctx->has_text_encoder) 
{ + printf("This GGUF file seems to have no text encoder\n"); + return false; + } + std::vector words; // first split the text into words @@ -98,8 +653,8 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin } } - std::vector tokens; - tokens.push_back(49406); // startoftext + std::vector v_tokens; + v_tokens.push_back(49406); // startoftext for (const auto & word : words) { // feel lucky? let's try if it's a full word @@ -113,7 +668,7 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin full_word += ""; auto wit = ctx->vocab.token_to_id.find(full_word); if (wit != ctx->vocab.token_to_id.end()) { - tokens.push_back(wit->second); + v_tokens.push_back(wit->second); continue; } @@ -122,7 +677,7 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin auto cand = word.substr(i, j - i + 1); auto it = ctx->vocab.token_to_id.find(cand); if (it != ctx->vocab.token_to_id.end()) { // word.substr(i, j-i+1) in vocab - tokens.push_back(it->second); + v_tokens.push_back(it->second); i = j + 1; break; } else if (j == i) { // word.substr(i, 1) has no matching @@ -133,23 +688,33 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin } } - tokens.push_back(49407); // endoftext + v_tokens.push_back(49407); // endoftext - return tokens; + tokens->size = v_tokens.size(); + + tokens->data = new int[v_tokens.size()]; + std::copy(v_tokens.begin(), v_tokens.end(), tokens->data); + + return true; } -bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img) { +clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); } + +clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); } + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; - auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3); + auto data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + fprintf(stderr, "%s: failed to 
load '%s'\n", __func__, fname); return false; } - img.nx = nx; - img.ny = ny; - img.data.resize(nx * ny * 3); - memcpy(img.data.data(), data, nx * ny * 3); + img->nx = nx; + img->ny = ny; + img->size = nx * ny * 3; + img->data = new uint8_t[img->size](); + memcpy(img->data, data, img->size); stbi_image_free(data); @@ -159,6 +724,11 @@ bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img) { // normalize: x = (x - mean) / std // TODO: implement bicubic interpolation instead of linear. bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + const int nx = img->nx; const int ny = img->ny; @@ -167,15 +737,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip res->nx = nx2; res->ny = ny2; - res->data.resize(3 * nx2 * ny2); + res->size = 3 * nx2 * ny2; + res->data = new float[res->size](); const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size; const int nx3 = int(nx / scale + 0.5f); const int ny3 = int(ny / scale + 0.5f); - const float m3[3] = {0.48145466f, 0.4578275f, 0.40821073f}; - const float s3[3] = {0.26862954f, 0.26130258f, 0.27577711f}; + const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; for (int y = 0; y < ny3; y++) { for (int x = 0; x < nx3; x++) { @@ -219,6 +790,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip return true; } + // Structure to hold the image data as an input to function to be executed for thread typedef struct { const clip_image_u8 * input; @@ -240,34 +812,35 @@ void * preprocess_image(void * arg) { } // Function to batch-preprocess multiple images i -void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const std::vector & img_inputs, - 
std::vector & imgs_resized) { - GGML_ASSERT(img_inputs.size() == imgs_resized.size()); - int num_threads = std::min(n_threads, static_cast(img_inputs.size())); +void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const clip_image_u8_batch * img_inputs, + clip_image_f32_batch * imgs_resized) { + imgs_resized->size = img_inputs->size; + + int num_threads = std::min(n_threads, static_cast(img_inputs->size)); int i, t; // Divide the images among the threads - int images_per_thread = img_inputs.size() / num_threads; + int images_per_thread = img_inputs->size / num_threads; if (num_threads == 1) { // Single-threaded case - for (i = 0; i < img_inputs.size(); i++) { - clip_image_preprocess(ctx, &img_inputs[i], &imgs_resized[i]); + for (i = 0; i < img_inputs->size; i++) { + clip_image_preprocess(ctx, &img_inputs->data[i], &imgs_resized->data[i]); } } else { // Multi-threaded case std::vector threads(num_threads); - std::vector imageData(img_inputs.size()); + std::vector imageData(img_inputs->size); for (t = 0; t < num_threads; t++) { int start_index = t * images_per_thread; - int end_index = (t == num_threads - 1) ? img_inputs.size() : start_index + images_per_thread; + int end_index = (t == num_threads - 1) ? 
img_inputs->size : start_index + images_per_thread; // Create ImageData for each thread for (i = start_index; i < end_index; i++) { - imageData[i].input = &img_inputs[i]; - imageData[i].resized = &imgs_resized[i]; + imageData[i].input = &img_inputs->data[i]; + imageData[i].resized = &imgs_resized->data[i]; imageData[i].ctx = ctx; } @@ -282,537 +855,22 @@ void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, cons } } -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { - if (verbosity >= 1) { - printf("%s: loading model from '%s' - please wait...", __func__, fname); - } - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname); - return nullptr; - } - - // verify magic - { - uint32_t magic; - fin.read((char *)&magic, sizeof(magic)); - if (magic != 0x67676d6c) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname); - return nullptr; - } - } - - clip_ctx * new_clip = new clip_ctx; - clip_text_model & text_model = new_clip->text_model; - clip_vision_model & vision_model = new_clip->vision_model; - clip_vocab & vocab = new_clip->vocab; - - // load hparams for text - { - auto & hparams = text_model.hparams; - - fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *)&hparams.num_positions, sizeof(hparams.num_positions)); - fin.read((char *)&hparams.hidden_size, sizeof(hparams.hidden_size)); - fin.read((char *)&hparams.n_intermediate, sizeof(hparams.n_intermediate)); - fin.read((char *)&hparams.projection_dim, sizeof(hparams.projection_dim)); - fin.read((char *)&hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer)); - - if (verbosity >= 2) { - printf("\n%s: text model hparams\n", __func__); - printf("n_vocab %d\n", hparams.n_vocab); - printf("num_positions %d\n", hparams.num_positions); - printf("t_hidden_size %d\n", hparams.hidden_size); - 
printf("t_n_intermediate %d\n", hparams.n_intermediate); - printf("t_projection_dim %d\n", hparams.projection_dim); - printf("t_n_head %d\n", hparams.n_head); - printf("t_n_layer %d\n", hparams.n_layer); - } - } - - // load hparams for vision - { - auto & hparams = vision_model.hparams; - - fin.read((char *)&hparams.image_size, sizeof(hparams.image_size)); - fin.read((char *)&hparams.patch_size, sizeof(hparams.patch_size)); - fin.read((char *)&hparams.hidden_size, sizeof(hparams.hidden_size)); - fin.read((char *)&hparams.n_intermediate, sizeof(hparams.n_intermediate)); - fin.read((char *)&hparams.projection_dim, sizeof(hparams.projection_dim)); - fin.read((char *)&hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer)); - - fin.read((char *)&new_clip->use_gelu, sizeof(new_clip->use_gelu)); - fin.read((char *)&new_clip->ftype, sizeof(new_clip->ftype)); - - if (verbosity >= 2) { - printf("\n%s: vision model hparams\n", __func__); - printf("image_size %d\n", hparams.image_size); - printf("patch_size %d\n", hparams.patch_size); - printf("v_hidden_size %d\n", hparams.hidden_size); - printf("v_n_intermediate %d\n", hparams.n_intermediate); - printf("v_projection_dim %d\n", hparams.projection_dim); - printf("v_n_head %d\n", hparams.n_head); - printf("v_n_layer %d\n", hparams.n_layer); - - printf("\nuse_gelu %d\n", new_clip->use_gelu); - printf("ftype %d\n\n", new_clip->ftype); - } - } - - // load vocab - { - int32_t n_vocab = 0; - fin.read((char *)&n_vocab, sizeof(n_vocab)); - - if (n_vocab != new_clip->text_model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", __func__, fname, n_vocab, - new_clip->text_model.hparams.n_vocab); - return nullptr; - } - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *)&len, sizeof(len)); - - buf.resize(len); - fin.read((char *)buf.data(), len); - word.assign(buf.data(), len); - - 
new_clip->vocab.token_to_id[word] = i; - new_clip->vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = GGML_TYPE_COUNT; - switch (new_clip->ftype) { - case 0: - wtype = GGML_TYPE_F32; - break; - case 1: - wtype = GGML_TYPE_F16; - break; - case 2: - wtype = GGML_TYPE_Q4_0; - break; - case 3: - wtype = GGML_TYPE_Q4_1; - break; - default: { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname, new_clip->ftype); - clip_free(new_clip); - return nullptr; - } - } - - auto & ctx = new_clip->ctx; - size_t model_mem_req = 0; - - { - // calculate memory requirement for text_model - const auto & hparams = text_model.hparams; - - const int n_vocab = hparams.n_vocab; - const int num_positions = hparams.num_positions; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - // Calculate size requirements - - model_mem_req += hidden_size * n_vocab * ggml_type_sizef(wtype); // token_embeddings - model_mem_req += hidden_size * num_positions * ggml_type_sizef(wtype); // position_embeddings - - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_* and ln_2_* - - model_mem_req += 4 * n_layer * (hidden_size * hidden_size * ggml_type_sizef(wtype)); // kqvo weights - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // kqvo bias - model_mem_req += 2 * n_layer * (hidden_size * n_intermediate * ggml_type_sizef(wtype)); // ff_*_w - model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b - model_mem_req += n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b - - model_mem_req += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // post_ln_* - 
model_mem_req += 2 * hidden_size * projection_dim * ggml_type_sizef(wtype); // projection - - model_mem_req += (5 + 16 * n_layer) * 256; // object overhead - } - - { - // calculate memory requirement for vision_model - const auto & hparams = vision_model.hparams; - - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)) + 1; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - // Calculate size requirements - - model_mem_req += hidden_size * ggml_type_sizef(GGML_TYPE_F32); // class_embedding - model_mem_req += hidden_size * 3 * patch_size * patch_size * ggml_type_sizef(GGML_TYPE_F16); // patch_embeddings - model_mem_req += hidden_size * num_patches * ggml_type_sizef(wtype); // position_embeddings - - model_mem_req += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // pre_ln_* - - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ln_* - - model_mem_req += 4 * n_layer * (hidden_size * hidden_size * ggml_type_sizef(wtype)); // kqvo weights - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // kqvo bias - - model_mem_req += 2 * n_layer * (hidden_size * n_intermediate * ggml_type_sizef(wtype)); // ff_*_w - model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b - model_mem_req += n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b - - model_mem_req += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // post_ln_* - model_mem_req += 2 * hidden_size * projection_dim * ggml_type_sizef(wtype); // projection - - model_mem_req += (5 + 16 * n_layer) * 256; // object overhead - } - - if (verbosity >= 2) { - printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0)); - } - - // create the 
ggml context - { - struct ggml_init_params params = { - .mem_size = model_mem_req, - .mem_buffer = NULL, - .no_alloc = false, - }; - - new_clip->ctx = ggml_init(params); - if (!new_clip->ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - clip_free(new_clip); - return nullptr; - } - } - - // prepare memory for the text_model weights - { - const auto & hparams = text_model.hparams; - - const int n_vocab = hparams.n_vocab; - const int num_positions = hparams.num_positions; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - text_model.layers.resize(n_layer); - - text_model.token_embeddings = ggml_new_tensor_2d(ctx, wtype, hidden_size, n_vocab); - text_model.position_embeddings = ggml_new_tensor_2d(ctx, wtype, hidden_size, num_positions); - - // map by name - text_model.tensors["text_model.embeddings.token_embedding.weight"] = text_model.token_embeddings; - text_model.tensors["text_model.embeddings.position_embedding.weight"] = text_model.position_embeddings; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = text_model.layers[i]; - - layer.ln_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.o_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.o_b = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.ff_i_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, n_intermediate); - layer.ff_i_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_intermediate); - - layer.ff_o_w = ggml_new_tensor_2d(ctx, wtype, n_intermediate, hidden_size); - layer.ff_o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - // map by name - - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.weight"] = layer.k_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.bias"] = layer.k_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.weight"] = layer.v_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.bias"] = layer.v_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.q_proj.weight"] = layer.q_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.q_proj.bias"] = layer.q_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.weight"] = layer.o_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.bias"] = layer.o_b; - - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".layer_norm1.weight"] = layer.ln_1_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".layer_norm1.bias"] = layer.ln_1_b; - - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.weight"] = layer.ff_i_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.bias"] = layer.ff_i_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.weight"] = layer.ff_o_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.bias"] = layer.ff_o_b; - - text_model.tensors["text_model.encoder.layers." 
+ std::to_string(i) + ".layer_norm2.weight"] = layer.ln_2_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".layer_norm2.bias"] = layer.ln_2_b; - } - - text_model.post_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - text_model.post_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - text_model.projection = ggml_new_tensor_2d(ctx, wtype, hidden_size, projection_dim); - - // map by name - text_model.tensors["text_model.final_layer_norm.weight"] = text_model.post_ln_w; - text_model.tensors["text_model.final_layer_norm.bias"] = text_model.post_ln_b; - text_model.tensors["text_projection.weight"] = text_model.projection; - } - - // prepare memory for the vision_model weights - { - const auto & hparams = vision_model.hparams; - - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)) + 1; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - vision_model.layers.resize(n_layer); - - vision_model.class_embedding = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.patch_embeddings = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, patch_size, patch_size, 3, hidden_size); - vision_model.position_embeddings = ggml_new_tensor_2d(ctx, wtype, hidden_size, num_patches); - - vision_model.pre_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.pre_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - // map by name - vision_model.tensors["vision_model.embeddings.class_embedding"] = vision_model.class_embedding; - vision_model.tensors["vision_model.embeddings.patch_embedding.weight"] = vision_model.patch_embeddings; - vision_model.tensors["vision_model.embeddings.position_embedding.weight"] = vision_model.position_embeddings; - - 
vision_model.tensors["vision_model.pre_layrnorm.weight"] = vision_model.pre_ln_w; - vision_model.tensors["vision_model.pre_layrnorm.bias"] = vision_model.pre_ln_b; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = vision_model.layers[i]; - - layer.ln_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.o_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.ff_i_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, n_intermediate); - layer.ff_i_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_intermediate); - - layer.ff_o_w = ggml_new_tensor_2d(ctx, wtype, n_intermediate, hidden_size); - layer.ff_o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - // map by name - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.weight"] = layer.k_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.bias"] = layer.k_b; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.weight"] = layer.v_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.bias"] = layer.v_b; - vision_model.tensors["vision_model.encoder.layers." 
+ std::to_string(i) + ".self_attn.q_proj.weight"] = layer.q_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.q_proj.bias"] = layer.q_b; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.weight"] = layer.o_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.bias"] = layer.o_b; - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".layer_norm1.weight"] = layer.ln_1_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".layer_norm1.bias"] = layer.ln_1_b; - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.weight"] = layer.ff_i_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.bias"] = layer.ff_i_b; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.weight"] = layer.ff_o_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.bias"] = layer.ff_o_b; - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".layer_norm2.weight"] = layer.ln_2_w; - vision_model.tensors["vision_model.encoder.layers." 
+ std::to_string(i) + ".layer_norm2.bias"] = layer.ln_2_b; - } - - vision_model.post_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.post_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.projection = ggml_new_tensor_2d(ctx, wtype, hidden_size, projection_dim); - - // map by name - vision_model.tensors["vision_model.post_layernorm.weight"] = vision_model.post_ln_w; - vision_model.tensors["vision_model.post_layernorm.bias"] = vision_model.post_ln_b; - vision_model.tensors["visual_projection.weight"] = vision_model.projection; - } - - // load weights - { - int n_tensors = 0; - size_t total_size = 0; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (fin.eof()) { - break; - } - - int64_t nelements = 1; - int64_t ne[4] = {1, 1, 1, 1}; - for (int i = 0; i < n_dims; ++i) { - int32_t ne_cur; - fin.read(reinterpret_cast(&ne_cur), sizeof(ne_cur)); - ne[i] = ne_cur; - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - struct ggml_tensor * tensor; - if (text_model.tensors.find(name.data()) != text_model.tensors.end()) { - tensor = text_model.tensors[name.data()]; - } else if (vision_model.tensors.find(name.data()) != vision_model.tensors.end()) { - tensor = vision_model.tensors[name.data()]; - } else { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - clip_free(new_clip); - return nullptr; - } - - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - clip_free(new_clip); - return nullptr; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n", - __func__, 
name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); - clip_free(new_clip); - return nullptr; - } - - if (0) { - static const char * ftype_str[] = { - "f32", - "f16", - "q4_0", - "q4_1", - }; - printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], - ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); - } - - size_t bpe = 0; - - switch (ftype) { - case 0: - bpe = ggml_type_size(GGML_TYPE_F32); - break; - case 1: - bpe = ggml_type_size(GGML_TYPE_F16); - break; - case 2: - bpe = ggml_type_size(GGML_TYPE_Q4_0); - assert(ne[0] % 64 == 0); - break; - case 3: - bpe = ggml_type_size(GGML_TYPE_Q4_1); - assert(ne[0] % 64 == 0); - break; - default: { - fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); - clip_free(new_clip); - return nullptr; - } - }; - - if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), - ggml_nbytes(tensor), nelements * bpe); - clip_free(new_clip); - return nullptr; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - -#ifdef CLIP_DEBUG_TENSORS - printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", - ggml_nbytes(tensor) / 1024.0 / 1024.0); -#endif - - total_size += ggml_nbytes(tensor); - if (verbosity >= 1) { - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } - } - } - - if (verbosity >= 1) { - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); - } - } - - fin.close(); - - // Calculate space requirements for setting up context buffers later - { - // TODO: We currently get the size of memory requirement from the pre-computed information - // based on the model variant, indicated by the number of tensors. 
- // Rewrite this logic when GGML implements a mechanism to predict the required memory. - const size_t n_tensors = new_clip->text_model.tensors.size() + new_clip->vision_model.tensors.size(); - const int n_image_positions = (vision_model.hparams.image_size / vision_model.hparams.patch_size) * - (vision_model.hparams.image_size / vision_model.hparams.patch_size) + - 1; - size_t mem_req = get_mem_req_by_size(n_tensors, n_image_positions); - new_clip->buf_compute.resize(mem_req); - - if (verbosity >= 2) { - printf("%s: %zu MB of compute buffer allocated\n", __func__, mem_req / 1024 / 1024); - } - } - - if (verbosity >= 1) { - printf("%s: model loaded\n\n", __func__); - } - - return new_clip; -} - void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx); + gguf_free(ctx->ctx_gguf); delete ctx; } -bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector & tokens, float * vec) { +bool clip_text_encode(const clip_ctx * ctx, const int n_threads, const clip_tokens * tokens, float * vec, + const bool normalize) { + if (!ctx->has_text_encoder) { + printf("This GGUF file seems to have no text encoder\n"); + return false; + } + const auto & model = ctx->text_model; const auto & hparams = model.hparams; - const int N = tokens.size(); + const size_t N = tokens->size; const int n_vocab = hparams.n_vocab; const int num_positions = hparams.num_positions; @@ -822,6 +880,7 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vectorbuf_compute; @@ -834,11 +893,11 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vectortext_model.tensors.size() + ctx->vision_model.tensors.size(), N); + static size_t scr0_size = get_scr_buf_req_by_size((struct clip_ctx *)ctx); static void * scr0 = malloc(scr0_size); struct ggml_tensor * input_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(input_ids->data, tokens.data(), N * ggml_element_size(input_ids)); + memcpy(input_ids->data, tokens->data, N * ggml_element_size(input_ids)); 
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); for (int i = 0; i < N; i++) { @@ -857,7 +916,7 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector imgs; - imgs.push_back(img); - return clip_image_batch_encode(ctx, n_threads, imgs, vec); +bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, const bool normalize) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + + clip_image_f32_batch imgs{}; + imgs.size = 1; + imgs.data = img; + return clip_image_batch_encode(ctx, n_threads, &imgs, vec, normalize); } -bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector & imgs, float * vec) { +bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, + const bool normalize) { + + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + const auto & model = ctx->vision_model; const auto & hparams = model.hparams; @@ -1039,7 +1113,8 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec const int n_layer = hparams.n_layer; const int n_intermediate = hparams.n_intermediate; const int projection_dim = hparams.projection_dim; - int batch_size = imgs.size(); + const float eps = hparams.eps; + int batch_size = imgs->size; auto & buf_compute = ctx->buf_compute; @@ -1052,8 +1127,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - static size_t scr0_size = - get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); + static size_t scr0_size = get_scr_buf_req_by_size((struct clip_ctx *)ctx); static void * scr0 = malloc(scr0_size); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, 
GGML_TYPE_F32, image_size, image_size, 3, batch_size); @@ -1061,9 +1135,9 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec { float * data = (float *)ggml_get_data(inp_raw); - for (int b = 0; b < imgs.size(); b++) { - const int nx = imgs[b].nx; - const int ny = imgs[b].ny; + for (int b = 0; b < imgs->size; b++) { + const int nx = imgs->data[b].nx; + const int ny = imgs->data[b].ny; GGML_ASSERT(nx == image_size && ny == image_size); const int n = nx * ny; @@ -1072,7 +1146,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec for (int k = 0; k < 3; k++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k]; } } } @@ -1106,7 +1180,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // pre-layernorm { - embeddings = ggml_norm(ctx0, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings), ggml_repeat(ctx0, model.pre_ln_b, embeddings)); @@ -1122,7 +1196,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // layernorm1 { - cur = ggml_norm(ctx0, cur); + cur = ggml_norm(ctx0, cur, eps); cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur), ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); @@ -1172,7 +1246,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // layernorm2 { - cur = ggml_norm(ctx0, cur); + cur = ggml_norm(ctx0, cur, eps); cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur), ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); @@ -1205,7 +1279,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // post-layernorm { - 
embeddings = ggml_norm(ctx0, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.post_ln_w, embeddings), embeddings), ggml_repeat(ctx0, model.post_ln_b, embeddings)); @@ -1221,8 +1295,10 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec for (int b = 0; b < batch_size; b++) { struct ggml_tensor * embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); - ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); - embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + if (normalize) { + ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + } output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); } ggml_set_name(output, "check"); @@ -1230,6 +1306,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // run the computation ggml_build_forward_expand(&gf, output); ggml_cplan cplan = ggml_graph_plan(&gf, n_threads); + cplan.work_size *= batch_size; if (cplan.work_size != 0) { cplan.work_data = (uint8_t *)malloc(cplan.work_size); } @@ -1293,39 +1370,45 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec return true; } -float clip_similarity_score(float * vec1, float * vec2, int vec_dim) { +float clip_similarity_score(const float * vec1, const float * vec2, const int vec_dim) { float dot_product = 0.0; for (int i = 0; i < vec_dim; i++) { dot_product += vec1[i] * vec2[i]; } - // Clamp the dot product to the range [0, 1]. 
- float clamped_dot_product = fmin(fmax(dot_product, 0.0), 1.0); - - return clamped_dot_product; + return dot_product; } -bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & text, clip_image_u8 & image, float * score) { +bool clip_compare_text_and_image(const clip_ctx * ctx, const int n_threads, const char * text, const clip_image_u8 * image, + float * score) { + if (!(ctx->has_text_encoder && ctx->has_vision_encoder)) { + printf("clip_compare_text_and_image function can only be used with two-tower models\n"); + return false; + } + // prepare image and text vectors const int projection_dim = ctx->vision_model.hparams.projection_dim; float img_vec[projection_dim]; float txt_vec[projection_dim]; - // preprocess and encode image - clip_image_f32 img_res; - - if (!clip_image_preprocess(ctx, &image, &img_res)) { + // tokenize and encode text + clip_tokens tokens; + if (!clip_tokenize(ctx, text, &tokens)) { return false; } - if (!clip_image_encode(ctx, n_threads, img_res, img_vec)) { + if (!clip_text_encode(ctx, n_threads, &tokens, txt_vec, true)) { return false; } - // tokenize and encode text - auto tokens = clip_tokenize(ctx, text); + // preprocess and encode image + clip_image_f32 img_res; - if (!clip_text_encode(ctx, n_threads, tokens, txt_vec)) { + if (!clip_image_preprocess(ctx, image, &img_res)) { + return false; + } + + if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, true)) { return false; } @@ -1353,23 +1436,17 @@ int compare_scores(const void * a, const void * b) { } } -bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int * indices) { +bool softmax_with_sorting(float * arr, const int length, float * sorted_scores, int * indices) { ScoreIndexPair * score_index_pairs = (ScoreIndexPair *)malloc(length * sizeof(ScoreIndexPair)); if (!score_index_pairs) { return false; } // Calculate softmax probabilities - float max_val = arr[0]; - for (int i = 1; i < length; i++) { - if (arr[i] > max_val) { - max_val = 
arr[i]; - } - } - float sum = 0.0; + double sum = 0.0; for (int i = 0; i < length; i++) { - arr[i] = exp(arr[i] - max_val); + arr[i] = exp(arr[i]) + 1e-9; sum += arr[i]; } @@ -1392,23 +1469,227 @@ bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int * return true; } -bool image_normalize(clip_image_u8 * img, clip_image_f32 * res) { - if (img->nx != 224 || img->ny != 224) { - printf("%s: long input shape: %d x %d\n", __func__, img->nx, img->ny); +bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img, + const char ** labels, const size_t n_labels, float * scores, int * indices) { + if (!(ctx->has_text_encoder && ctx->has_vision_encoder)) { + printf("clip_zero_shot_label_image function can only be used with two-tower models\n"); return false; } - const float m3[3] = {0.48145466f, 0.4578275f, 0.40821073f}; - const float s3[3] = {0.26862954f, 0.26130258f, 0.27577711f}; + // load the image + clip_image_f32 img_res; + + const int vec_dim = clip_get_vision_hparams(ctx)->projection_dim; - for (int y = 0; y < img->ny; y++) { - for (int x = 0; x < img->nx; x++) { - for (int c = 0; c < 3; c++) { - const int i = 3 * (y * img->nx + x) + c; - float v = (float)img->data[i]; - res->data[i] = ((v / 255.0f) - m3[c]) / s3[c]; + clip_image_preprocess(ctx, input_img, &img_res); + + float img_vec[vec_dim]; + if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, false)) { + return false; + } + + // encode texts and compute similarities + float txt_vec[vec_dim]; + float similarities[n_labels]; + + for (int i = 0; i < n_labels; i++) { + const auto & text = labels[i]; + clip_tokens tokens; + clip_tokenize(ctx, text, &tokens); + clip_text_encode(ctx, n_threads, &tokens, txt_vec, false); + similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim); + } + + // apply softmax and sort scores + softmax_with_sorting(similarities, n_labels, scores, indices); + + return true; +} + +bool 
clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + + ggml_type type = GGML_TYPE_Q4_1; + + switch (itype) { + case 2: + type = GGML_TYPE_Q4_0; + break; + case 3: + type = GGML_TYPE_Q4_1; + break; + case 6: + type = GGML_TYPE_Q5_0; + break; + case 7: + type = GGML_TYPE_Q5_1; + break; + case 8: + type = GGML_TYPE_Q8_0; + break; + default: + fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); + return false; + }; + + auto ctx_clip = clip_model_load(fname_inp, 2); + const auto & ctx_src = ctx_clip->ctx_gguf; + const auto & ctx_data = ctx_clip->ctx; + + auto ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector k_names = { + ".*weight", + }; + + std::vector read_data(512); + std::vector work(512); + std::vector conv_buf(512); + std::vector hist_all(1 << 4, 0); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors + quantize &= (cur->n_dims == 2); + + if (quantize) { + 
new_type = type; + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (int j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + default: + printf("Please use an input file in f32 or f16\n"); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + std::vector hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: { + new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: { + new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: { + new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: { + new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: { + new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + default: { + fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type); + return false; + } } + + for (int j = 0; j < hist_cur.size(); ++j) { + hist_all[j] += hist_cur[j]; + } + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (int j = 0; j < pad; ++j) { + fout.put(0); + } + + printf("%s: 
n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); } + return true; } + +struct clip_text_hparams * clip_get_text_hparams(struct clip_ctx * ctx) { return &ctx->text_model.hparams; } +struct clip_vision_hparams * clip_get_vision_hparams(struct clip_ctx * ctx) { return &ctx->vision_model.hparams; } diff --git a/clip.h b/clip.h index 3cbbc66..18fe3da 100644 --- a/clip.h +++ b/clip.h @@ -1,205 +1,106 @@ #ifndef CLIP_H #define CLIP_H -#include "llama.cpp/ggml.h" -#include -#include -#include -#include - -// TODO: make the API in C -// #ifdef __cplusplus -// extern "C" { -// #endif - -// default hparams for text_model (ViT-B/32) -struct clip_text_hparams { - int32_t n_vocab = 49408; - int32_t num_positions = 77; - int32_t hidden_size = 512; - int32_t n_intermediate = 2048; - int32_t projection_dim = 512; - int32_t n_head = 8; - int32_t n_layer = 12; -}; - -// default hparams for vision_model (ViT-B/32) -struct clip_vision_hparams { - int32_t image_size = 224; - int32_t patch_size = 32; - int32_t hidden_size = 768; - int32_t n_intermediate = 3072; - int32_t projection_dim = 512; - int32_t n_head = 12; - int32_t n_layer = 12; -}; - -// -// Vocab utils 
-// - -std::string trim(const std::string & s); - -std::string replace(const std::string & s, const std::string & from, const std::string & to); +#include "ggml.h" -struct clip_vocab { - using id = int32_t; - using token = std::string; +struct clip_ctx; - std::map token_to_id; - std::map id_to_token; - std::vector special_tokens; +#ifdef __cplusplus +extern "C" { +#endif - void add_special_token(const std::string & token); -}; - -std::string convert_to_utf8(const std::wstring & input); - -std::wstring convert_to_wstring(const std::string & input); - -// split text into tokens -// -// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 -// -// Regex (Python): -// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" -// -// Regex (C++): -// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" -// - -struct clip_layer { - // attention - struct ggml_tensor * k_w; - struct ggml_tensor * k_b; - struct ggml_tensor * q_w; - struct ggml_tensor * q_b; - struct ggml_tensor * v_w; - struct ggml_tensor * v_b; - - struct ggml_tensor * o_w; - struct ggml_tensor * o_b; - - // layernorm 1 - struct ggml_tensor * ln_1_w; - struct ggml_tensor * ln_1_b; - - // ff - struct ggml_tensor * ff_i_w; - struct ggml_tensor * ff_i_b; - - struct ggml_tensor * ff_o_w; - struct ggml_tensor * ff_o_b; - - // layernorm 2 - struct ggml_tensor * ln_2_w; - struct ggml_tensor * ln_2_b; +struct clip_text_hparams { + int32_t n_vocab; + int32_t num_positions; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + float eps; }; -struct clip_text_model { - clip_text_hparams hparams; - - // embeddings - struct ggml_tensor * token_embeddings; - struct ggml_tensor * position_embeddings; - - std::vector layers; - - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; - - struct ggml_tensor * projection; - - std::map 
tensors; +struct clip_vision_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + float eps; }; -struct clip_vision_model { - clip_vision_hparams hparams; - - // embeddings - struct ggml_tensor * class_embedding; - struct ggml_tensor * patch_embeddings; - struct ggml_tensor * position_embeddings; - - struct ggml_tensor * pre_ln_w; - struct ggml_tensor * pre_ln_b; - - std::vector layers; - - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; - - struct ggml_tensor * projection; - - std::map tensors; +typedef int32_t clip_vocab_id; +struct clip_tokens { + clip_vocab_id * data; + size_t size; }; struct clip_ctx * clip_model_load(const char * fname, const int verbosity); -// Replacement for std::vector that doesn't require zero-initialization. -struct clip_buffer { - uint8_t * data = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] data; - data = new uint8_t[size]; - this->size = size; - } +void clip_free(struct clip_ctx * ctx); - ~clip_buffer() { delete[] data; } -}; - -struct clip_ctx { - clip_text_model text_model; - clip_vision_model vision_model; - clip_vocab vocab; - int32_t use_gelu = 0; - int32_t ftype = 1; - ggml_context * ctx; - clip_buffer buf_compute; -}; - -void clip_free(clip_ctx * ctx); +struct clip_text_hparams * clip_get_text_hparams(struct clip_ctx * ctx); +struct clip_vision_hparams * clip_get_vision_hparams(struct clip_ctx * ctx); // RGB uint8 image struct clip_image_u8 { int nx; int ny; - - std::vector data; + uint8_t * data; + size_t size; }; -// RGB float32 image +// RGB float32 image (NHWC) // Memory layout: RGBRGBRGB... 
struct clip_image_f32 { int nx; int ny; + float * data; + size_t size; +}; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; - std::vector data; +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; }; -std::vector clip_tokenize(const clip_ctx * ctx, const std::string & text); +bool clip_tokenize(const struct clip_ctx * ctx, const char * text, struct clip_tokens * tokens); -bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img); -bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res); -void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const std::vector & img_inputs, - std::vector & img_resized); +struct clip_image_u8 * make_clip_image_u8(); +struct clip_image_f32 * make_clip_image_f32(); +bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res); -bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector & tokens, float * vec); +bool clip_text_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_tokens * tokens, float * vec, + const bool normalize); +bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec, + const bool normalize); -bool clip_image_encode(const clip_ctx * ctx, int n_threads, const clip_image_f32 & img, float * vec); +void clip_image_batch_preprocess(const struct clip_ctx * ctx, const int n_threads, + const struct clip_image_u8_batch * img_inputs, struct clip_image_f32_batch * imgs_resized); +bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, + float * vec, const bool normalize); -// bool image_normalize(clip_image_u8 *img, clip_image_f32 *res); +// bool image_normalize(const clip_image_u8 *img, 
clip_image_f32 *res); -bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & text, clip_image_u8 & image, float * score); -float clip_similarity_score(float * vec1, float * vec2, int vec_dim); -bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int * indices); +bool clip_compare_text_and_image(const struct clip_ctx * ctx, const int n_threads, const char * text, + const struct clip_image_u8 * image, float * score); +float clip_similarity_score(const float * vec1, const float * vec2, const int vec_dim); +bool softmax_with_sorting(float * arr, const int length, float * sorted_scores, int * indices); +bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img, + const char ** labels, const size_t n_labels, float * scores, int * indices); -bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector & imgs, float * vec); +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); -// #ifdef __cplusplus -// } -// #endif +#ifdef __cplusplus +} +#endif -#endif // CLIP_H \ No newline at end of file +#endif // CLIP_H diff --git a/llama.cpp b/llama.cpp index 294f424..40e07a6 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 294f424554c1599784ac9962462fc39ace92d8a5 +Subproject commit 40e07a60f9ce06e79f3ccd4c903eba300fb31b5e diff --git a/main.cpp b/main.cpp index 4b8de59..0904b59 100644 --- a/main.cpp +++ b/main.cpp @@ -1,7 +1,11 @@ #include "clip.h" -#include "llama.h" -int main() { - printf("Hello from LLava.cpp\n"); +int main(int argc, char ** argv) { + const char * path = argv[1]; + + auto p_ctx = clip_model_load(path, 1); + + clip_free(p_ctx); + return 0; -} \ No newline at end of file +} diff --git a/models/convert_hf_to_gguf.py b/models/convert_hf_to_gguf.py new file mode 100644 index 0000000..2d1a47c --- /dev/null +++ b/models/convert_hf_to_gguf.py @@ -0,0 +1,240 @@ +import argparse +import os 
+import json + +import torch +import numpy as np +from gguf import * +from transformers import CLIPModel, CLIPProcessor + +TEXT = "clip.text" +VISION = "clip.vision" + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if name == "visual_projection.weight" and has_llava: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--llava-projector", help="Path to projector.pt file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") +ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) + +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --vision-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + + +model = CLIPModel.from_pretrained(dir_model) +processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{output_prefix}_ggml-{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector)
+fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + + image_mean = processor.image_processor.image_mean if 
args.image_mean is None else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None else args.image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" +fout.add_bool("clip.use_gelu", use_gelu) + + + + +if has_llava_projector: + model.vision_model.encoder.layers.pop(-1) + projector = torch.load(args.llava_projector) + weight = projector["model.mm_projector.weight"].cpu().squeeze().float().numpy().astype(np.float16) + bias = projector['model.mm_projector.bias'].cpu().squeeze().float().numpy().astype(np.float32) + fout.add_tensor("llava_projector.weight", weight) + fout.add_tensor("llava_projector.bias", bias) + print("Projector tensors added\n") + + +list_vars = model.state_dict() +for name, data in list_vars.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. 
Output file: " + fname_out) diff --git a/models/llava_surgery.py b/models/llava_surgery.py new file mode 100644 index 0000000..a97cc06 --- /dev/null +++ b/models/llava_surgery.py @@ -0,0 +1,63 @@ +import argparse +from llava.model import LlavaLlamaForCausalLM +from transformers import AutoTokenizer +from peft import PeftModel +import torch + +dtype = torch.bfloat16 + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA RLHF model") +ap.add_argument("-o", "--output", help="Output directory to save the merged file") +args = ap.parse_args() + +model_path = f"{args.model}/sft_model" +lora_path = f"{args.model}/rlhf_lora_adapter_model" +save_path = args.output + +model = LlavaLlamaForCausalLM.from_pretrained( + model_path, + device_map={"": "cuda:0"}, + torch_dtype=dtype, +) +model = PeftModel.from_pretrained( + model, + lora_path, +) + + +model = model.merge_and_unload() + +model.save_pretrained(save_path) + +tokenizer = AutoTokenizer.from_pretrained(model_path) +tokenizer.save_pretrained(save_path) + +del model +del tokenizer + + +# Load the checkpoint +checkpoint = torch.load(f"{save_path}/pytorch_model-00002-of-00002.bin") + +# Extract the tensors we want +mm_projector_weight = checkpoint['model.mm_projector.weight'] +mm_projector_bias = checkpoint['model.mm_projector.bias'] + +# Remove the tensors from the checkpoint +del checkpoint['model.mm_projector.weight'] +del checkpoint['model.mm_projector.bias'] + +# Create a dictionary with the original names as keys +mm_projector = { + 'model.mm_projector.weight': mm_projector_weight, + 'model.mm_projector.bias': mm_projector_bias +} + +# Save the combined dictionary using torch.save +torch.save(mm_projector, "projector.pt") + +# Save the rest of the model with the same original name +torch.save(checkpoint, f"{save_path}/pytorch_model-00002-of-00002.bin") + +print("Operation complete!") diff --git a/models/requirements.txt b/models/requirements.txt new file mode 100644 index
0000000..a753aab --- /dev/null +++ b/models/requirements.txt @@ -0,0 +1,4 @@ +torch +transformers +peft +gguf