diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b82da9..79ea3c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,21 +1,22 @@ cmake_minimum_required(VERSION 3.12) -project(llava.cpp) +project(lmm.cpp) # Add "llama.cpp" submodule -add_subdirectory(llama.cpp) +add_subdirectory(llama.cpp llama.cpp/common) # Add "clip.cpp" library add_library(clip clip.cpp clip.h) target_link_libraries(clip PRIVATE ggml) -# Add target executable/library for llava.cpp -add_executable(llava main.cpp) +# Add target executable/library for main +add_executable(main main.cpp) # Link against clip and llama libs -target_link_libraries(llava clip llama) +target_link_libraries(main clip llama common) -# Include directories for llava -target_include_directories(llava PRIVATE +# Include directories for lmm +target_include_directories(main PRIVATE . ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp/common ) diff --git a/clip.cpp b/clip.cpp index ddd5d64..4a13ad8 100644 --- a/clip.cpp +++ b/clip.cpp @@ -4,68 +4,623 @@ #include #include #include +#include #include #include +#include +#include +#include #include "clip.h" -#include "llama.cpp/ggml.h" +#include "ggml.h" #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" // #define CLIP_DEBUG +static std::string format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +// +// key constants +// + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" + +// +// tensor name constants +// + +#define TN_TOKEN_EMBD "%s.token_embd.weight" +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" +#define TN_LN_2 "%s.blk.%d.ln2.%s" +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_TEXT_PROJ "text_projection.weight" +#define TN_VIS_PROJ 
"visual_projection.weight" +#define TN_LLAVA_PROJ "llava_projector.%s" + +// +// utilities to get data from a gguf file +// + +int get_key_idx(const gguf_context * ctx, const char * key) { + int i = gguf_find_key(ctx, key); + if (i == -1) { + fprintf(stderr, "key %s not found in file\n", key); + throw std::runtime_error(format("Missing required key: %s", key)); + } + + return i; +} + +const uint32_t get_u32(const gguf_context * ctx, std::string key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_u32(ctx, i); +} + +const float get_f32(const gguf_context * ctx, std::string key) { + const int i = get_key_idx(ctx, key.c_str()); + + return gguf_get_val_f32(ctx, i); +} + +struct ggml_tensor * get_tensor(struct ggml_context * ctx, std::string name) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str()); + if (!cur) { + printf("unable to find tensor %s\n", name.c_str()); + throw std::runtime_error(format("unable to find tensor %s\n", name.c_str())); + } + + return cur; +} + +std::string get_ftype(int ftype) { + switch (ftype) { + case 0: + return "f32"; + break; + case 1: + return "f16"; + break; + case 2: + return "q4_0"; + break; + case 3: + return "q4_1"; + break; + case 6: + return "q5_0"; + break; + case 7: + return "q5_1"; + break; + case 8: + return "q8_0"; + break; + default: + throw std::runtime_error(format("Unrecognized file type: %d\n", ftype)); + } +} + +// +// Vocab utils +// + +struct clip_vocab { + using id = clip_vocab_id; + using token = std::string; + + std::map token_to_id; + std::map id_to_token; + std::vector special_tokens; + + // void add_special_token(const std::string & token); +}; + +// +// clip layers +// + +struct clip_layer { + // attention + struct ggml_tensor * k_w; + struct ggml_tensor * k_b; + struct ggml_tensor * q_w; + struct ggml_tensor * q_b; + struct ggml_tensor * v_w; + struct ggml_tensor * v_b; + + struct ggml_tensor * o_w; + struct ggml_tensor * o_b; + + // layernorm 1 + struct ggml_tensor * 
ln_1_w; + struct ggml_tensor * ln_1_b; + + // ff + struct ggml_tensor * ff_i_w; + struct ggml_tensor * ff_i_b; + + struct ggml_tensor * ff_o_w; + struct ggml_tensor * ff_o_b; + + // layernorm 2 + struct ggml_tensor * ln_2_w; + struct ggml_tensor * ln_2_b; +}; + +struct clip_text_model { + struct clip_text_hparams hparams; + + // embeddings + struct ggml_tensor * token_embeddings; + struct ggml_tensor * position_embeddings; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; +}; + +struct clip_vision_model { + struct clip_vision_hparams hparams; + + // embeddings + struct ggml_tensor * class_embedding; + struct ggml_tensor * patch_embeddings; + struct ggml_tensor * position_embeddings; + + struct ggml_tensor * pre_ln_w; + struct ggml_tensor * pre_ln_b; + + std::vector layers; + + struct ggml_tensor * post_ln_w; + struct ggml_tensor * post_ln_b; + + struct ggml_tensor * projection; + + // LLaVA projection + struct ggml_tensor * llava_proj_w; + struct ggml_tensor * llava_proj_b; +}; + +// Replacement for std::vector that doesn't require zero-initialization. 
+struct clip_buffer { + uint8_t * data = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] data; + data = new uint8_t[size]; + this->size = size; + } + + ~clip_buffer() { delete[] data; } +}; + +struct clip_ctx { + bool has_text_encoder = false; + bool has_vision_encoder = false; + bool has_llava_projector = false; + struct clip_text_model text_model; + struct clip_vision_model vision_model; + struct clip_vocab vocab; + float image_mean[3]; + float image_std[3]; + bool use_gelu = false; + int32_t ftype = 1; + struct ggml_context * ctx; + struct gguf_context * ctx_gguf; + struct clip_buffer buf_compute; +}; + +// +// memory allocation and management +// + // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved // after that, remove this and use the mechanism implemented in GGML directly -size_t get_mem_req_by_size(const size_t n_tensors, const int n_image_positions) { +size_t get_mem_req_by_size(struct clip_ctx * ctx) { size_t mb = 1024 * 1024; + const int n_tensors = gguf_get_n_tensors(ctx->ctx_gguf); + const auto & vision_hparams = clip_get_vision_hparams(ctx); + const int n_positions = + ctx->has_vision_encoder ? 
vision_hparams->image_size * vision_hparams->image_size / vision_hparams->patch_size + 1 : 77; switch (n_tensors) { - case 397: // base - if (n_image_positions == 50) { // patch size = 32 + case 397: // base, two-tower + case 200: // base, vision-only + if (n_positions == 50) { // patch size = 32 return 12 * mb; } else { // patch size = 16 return 24 * mb; } - case 589: // large - if (n_image_positions == 257) { // input image size = 224 + case 197: // base or large, text-only + return 12 * mb; + case 589: // large, two-tower + case 392: // large, vision-only + case 377: // large, LLaVA encoder + if (n_positions == 257) { // input image size = 224 return 24 * mb; } else { // input image size = 336 return 60 * mb; } - case 909: // huge + case 909: // huge, two-tower + case 520: // huge, vision-only return 232 * mb; + case 389: // huge, text-only + return 120 * mb; default: - fprintf(stderr, "%s: Unrecognized number of tensors: %zu. Check if you pass the correct model file\n", __func__, + fprintf(stderr, "%s: Unrecognized number of tensors: %d. Check if you pass the correct model file\n", __func__, n_tensors); exit(1); } } -size_t get_scr_buf_req_by_size(const size_t n_tensors, const int n_positions) { +size_t get_scr_buf_req_by_size(struct clip_ctx * ctx) { size_t mb = 1024 * 1024; + + const int n_tensors = gguf_get_n_tensors(ctx->ctx_gguf); + const auto & vision_hparams = clip_get_vision_hparams(ctx); + const int n_positions = + ctx->has_vision_encoder ? vision_hparams->image_size * vision_hparams->image_size / vision_hparams->patch_size + 1 : 77; + switch (n_tensors) { case 397: + case 200: if (n_positions <= 50) { return 32 * mb; } else { return 96 * mb; } + case 197: + return 32 * mb; case 589: + case 392: + case 377: if (n_positions <= 257) { return 96 * mb; } else { return 192 * mb; } case 909: + case 520: return 144 * mb; + case 389: + return 60 * mb; default: - fprintf(stderr, "%s: Unrecognized number of tensors: %zu. 
Check if you pass the correct model file\n", __func__, + fprintf(stderr, "%s: Unrecognized number of tensors: %d. Check if you pass the correct model file\n", __func__, n_tensors); exit(1); } } -std::vector clip_tokenize(const clip_ctx * ctx, const std::string & text) { +// read and create ggml_context containing the tensors and their data +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + + struct ggml_context * meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname, params); + + if (verbosity >= 1) { + const int n_tensors = gguf_get_n_tensors(ctx); + const int n_kv = gguf_get_n_kv(ctx); + const int ftype = get_u32(ctx, KEY_FTYPE); + const std::string ftype_str = get_ftype(ftype); + const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION); + const std::string description = gguf_get_val_str(ctx, idx_desc); + const int idx_name = gguf_find_key(ctx, KEY_NAME); + if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug + const std::string name = gguf_get_val_str(ctx, idx_name); + printf("%s: model name: %s\n", __func__, name.c_str()); + } + printf("%s: description: %s\n", __func__, description.c_str()); + printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + printf("%s: n_tensors: %d\n", __func__, n_tensors); + printf("%s: n_kv: %d\n", __func__, n_kv); + printf("%s: ftype: %s\n", __func__, ftype_str.c_str()); + printf("\n"); + } + + // kv + if (verbosity >= 3) { + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ctx, i); + + printf("%s: kv[%d]: key = %s\n", __func__, i, key); + } + printf("\n"); + } + + // data + size_t ctx_size = 0; + { + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + const char 
* name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + + struct ggml_tensor * cur = ggml_get_tensor(meta, name); + ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + size_t tensor_size = ggml_nbytes(cur); + size_t padded_size = ggml_nbytes_pad(cur); + ctx_size += padded_size; + if (verbosity >= 3) { + printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, + cur->n_dims, cur->name, tensor_size, padded_size, offset); + } + } + } + + clip_ctx * new_clip = new clip_ctx; + + // model size and capabilities + { + int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); + new_clip->has_text_encoder = gguf_get_val_bool(ctx, idx); + + idx = get_key_idx(ctx, KEY_HAS_VIS_ENC); + new_clip->has_vision_encoder = gguf_get_val_bool(ctx, idx); + + idx = gguf_find_key(ctx, KEY_HAS_LLAVA_PROJ); + if (idx != -1) { + new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); + } + + idx = get_key_idx(ctx, KEY_USE_GELU); + new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + + if (verbosity >= 1) { + printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0)); + printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + } + } + + // load tensors + { + struct ggml_init_params params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + .no_alloc = false, + }; + + new_clip->ctx = ggml_init(params); + if (!new_clip->ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + clip_free(new_clip); + return nullptr; + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + printf("cannot open model file for loading tensors\n"); + clip_free(new_clip); + return nullptr; + } + + const int n_tensors = 
gguf_get_n_tensors(ctx); + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + struct ggml_tensor * t = ggml_get_tensor(meta, name); + struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t); + ggml_set_name(cur, name); + + const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); + fin.seekg(offset, std::ios::beg); + if (!fin) { + printf("%s: failed to seek for tensor %s\n", __func__, name); + clip_free(new_clip); + return nullptr; + } + + fin.read(reinterpret_cast(cur->data), ggml_nbytes(t)); + } + + fin.close(); + } + + // text model + if (new_clip->has_text_encoder) { + // load text model + auto & text_model = new_clip->text_model; + auto & hparams = text_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "text")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "text")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "text")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "text")); + hparams.num_positions = get_u32(ctx, KEY_N_POSITIONS); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "text")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "text")); + + const int idx_tokens = get_key_idx(ctx, KEY_TOKENS); + hparams.n_vocab = gguf_get_arr_n(ctx, idx_tokens); + auto & vocab = new_clip->vocab; + for (int id = 0; id < hparams.n_vocab; ++id) { + const std::string token = gguf_get_arr_str(ctx, idx_tokens, id); + vocab.id_to_token[id] = token; + vocab.token_to_id[token] = id; + } + + if (verbosity >= 2) { + printf("\n%s: text model hparams\n", __func__); + printf("n_vocab %d\n", hparams.n_vocab); + printf("num_positions %d\n", hparams.num_positions); + printf("t_hidden_size %d\n", hparams.hidden_size); + printf("t_n_intermediate %d\n", hparams.n_intermediate); + printf("t_projection_dim %d\n", hparams.projection_dim); + printf("t_n_head %d\n", hparams.n_head); + printf("t_n_layer %d\n", hparams.n_layer); + } + + 
text_model.token_embeddings = get_tensor(new_clip->ctx, format(TN_TOKEN_EMBD, "t")); + text_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "t")); + text_model.post_ln_w = get_tensor(new_clip->ctx, format(TN_LN_POST, "t", "weight")); + text_model.post_ln_b = get_tensor(new_clip->ctx, format(TN_LN_POST, "t", "bias")); + text_model.projection = get_tensor(new_clip->ctx, TN_TEXT_PROJ); + text_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = text_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "t", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "t", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "t", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "t", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "t", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "t", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "t", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "t", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "t", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "t", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "t", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "t", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "t", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "t", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "t", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "t", il, "bias")); + } + } + + // vision model + if (new_clip->has_vision_encoder) { + // load vision model + auto & vision_model = new_clip->vision_model; + auto & hparams = 
vision_model.hparams; + hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision")); + hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision")); + hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision")); + hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision")); + hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE); + hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE); + hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); + hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); + + int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); + int idx_std = get_key_idx(ctx, KEY_IMAGE_STD); + for (int i = 0; i < 3; ++i) { + new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean)); + new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std)); + } + + if (verbosity >= 2) { + printf("\n%s: vision model hparams\n", __func__); + printf("image_size %d\n", hparams.image_size); + printf("patch_size %d\n", hparams.patch_size); + printf("v_hidden_size %d\n", hparams.hidden_size); + printf("v_n_intermediate %d\n", hparams.n_intermediate); + printf("v_projection_dim %d\n", hparams.projection_dim); + printf("v_n_head %d\n", hparams.n_head); + printf("v_n_layer %d\n", hparams.n_layer); + } + + vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD); + vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD); + vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v")); + vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias")); + vision_model.post_ln_w = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx, format(TN_LN_POST, "v", "bias")); + if (new_clip->has_llava_projector) { + vision_model.llava_proj_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, "weight")); + 
vision_model.llava_proj_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, "bias")); + } else { + vision_model.projection = get_tensor(new_clip->ctx, TN_VIS_PROJ); + } + vision_model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = vision_model.layers[il]; + layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight")); + layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight")); + layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight")); + layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight")); + layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight")); + layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight")); + layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight")); + layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias")); + layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias")); + layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias")); + layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias")); + layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias")); + layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias")); + layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias")); + layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias")); + } + } + + ggml_free(meta); + + new_clip->ctx_gguf = ctx; + + const size_t mem_req = get_mem_req_by_size(new_clip); + new_clip->buf_compute.resize(mem_req); + if (verbosity >= 1) { + printf("\n%s: %zu MB of memory allocated\n", __func__, mem_req / 1024 / 1024); + } + + return new_clip; +} + +bool clip_tokenize(const clip_ctx * ctx, const char * text, struct clip_tokens * tokens) { + if (!ctx->has_text_encoder) 
{ + printf("This GGUF file seems to have no text encoder\n"); + return false; + } + std::vector words; // first split the text into words @@ -98,8 +653,8 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin } } - std::vector tokens; - tokens.push_back(49406); // startoftext + std::vector v_tokens; + v_tokens.push_back(49406); // startoftext for (const auto & word : words) { // feel lucky? let's try if it's a full word @@ -113,7 +668,7 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin full_word += ""; auto wit = ctx->vocab.token_to_id.find(full_word); if (wit != ctx->vocab.token_to_id.end()) { - tokens.push_back(wit->second); + v_tokens.push_back(wit->second); continue; } @@ -122,7 +677,7 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin auto cand = word.substr(i, j - i + 1); auto it = ctx->vocab.token_to_id.find(cand); if (it != ctx->vocab.token_to_id.end()) { // word.substr(i, j-i+1) in vocab - tokens.push_back(it->second); + v_tokens.push_back(it->second); i = j + 1; break; } else if (j == i) { // word.substr(i, 1) has no matching @@ -133,23 +688,33 @@ std::vector clip_tokenize(const clip_ctx * ctx, const std::strin } } - tokens.push_back(49407); // endoftext + v_tokens.push_back(49407); // endoftext - return tokens; + tokens->size = v_tokens.size(); + + tokens->data = new int[v_tokens.size()]; + std::copy(v_tokens.begin(), v_tokens.end(), tokens->data); + + return true; } -bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img) { +clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); } + +clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); } + +bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; - auto data = stbi_load(fname.c_str(), &nx, &ny, &nc, 3); + auto data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + fprintf(stderr, "%s: failed to 
load '%s'\n", __func__, fname); return false; } - img.nx = nx; - img.ny = ny; - img.data.resize(nx * ny * 3); - memcpy(img.data.data(), data, nx * ny * 3); + img->nx = nx; + img->ny = ny; + img->size = nx * ny * 3; + img->data = new uint8_t[img->size](); + memcpy(img->data, data, img->size); stbi_image_free(data); @@ -159,6 +724,11 @@ bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img) { // normalize: x = (x - mean) / std // TODO: implement bicubic interpolation instead of linear. bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + const int nx = img->nx; const int ny = img->ny; @@ -167,15 +737,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip res->nx = nx2; res->ny = ny2; - res->data.resize(3 * nx2 * ny2); + res->size = 3 * nx2 * ny2; + res->data = new float[res->size](); const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size; const int nx3 = int(nx / scale + 0.5f); const int ny3 = int(ny / scale + 0.5f); - const float m3[3] = {0.48145466f, 0.4578275f, 0.40821073f}; - const float s3[3] = {0.26862954f, 0.26130258f, 0.27577711f}; + const auto & m3 = ctx->image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto & s3 = ctx->image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; for (int y = 0; y < ny3; y++) { for (int x = 0; x < nx3; x++) { @@ -219,6 +790,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip return true; } + // Structure to hold the image data as an input to function to be executed for thread typedef struct { const clip_image_u8 * input; @@ -240,34 +812,35 @@ void * preprocess_image(void * arg) { } // Function to batch-preprocess multiple images i -void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const std::vector & img_inputs, - 
std::vector & imgs_resized) { - GGML_ASSERT(img_inputs.size() == imgs_resized.size()); - int num_threads = std::min(n_threads, static_cast(img_inputs.size())); +void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const clip_image_u8_batch * img_inputs, + clip_image_f32_batch * imgs_resized) { + imgs_resized->size = img_inputs->size; + + int num_threads = std::min(n_threads, static_cast(img_inputs->size)); int i, t; // Divide the images among the threads - int images_per_thread = img_inputs.size() / num_threads; + int images_per_thread = img_inputs->size / num_threads; if (num_threads == 1) { // Single-threaded case - for (i = 0; i < img_inputs.size(); i++) { - clip_image_preprocess(ctx, &img_inputs[i], &imgs_resized[i]); + for (i = 0; i < img_inputs->size; i++) { + clip_image_preprocess(ctx, &img_inputs->data[i], &imgs_resized->data[i]); } } else { // Multi-threaded case std::vector threads(num_threads); - std::vector imageData(img_inputs.size()); + std::vector imageData(img_inputs->size); for (t = 0; t < num_threads; t++) { int start_index = t * images_per_thread; - int end_index = (t == num_threads - 1) ? img_inputs.size() : start_index + images_per_thread; + int end_index = (t == num_threads - 1) ? 
img_inputs->size : start_index + images_per_thread; // Create ImageData for each thread for (i = start_index; i < end_index; i++) { - imageData[i].input = &img_inputs[i]; - imageData[i].resized = &imgs_resized[i]; + imageData[i].input = &img_inputs->data[i]; + imageData[i].resized = &imgs_resized->data[i]; imageData[i].ctx = ctx; } @@ -282,537 +855,22 @@ void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, cons } } -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { - if (verbosity >= 1) { - printf("%s: loading model from '%s' - please wait...", __func__, fname); - } - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname); - return nullptr; - } - - // verify magic - { - uint32_t magic; - fin.read((char *)&magic, sizeof(magic)); - if (magic != 0x67676d6c) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname); - return nullptr; - } - } - - clip_ctx * new_clip = new clip_ctx; - clip_text_model & text_model = new_clip->text_model; - clip_vision_model & vision_model = new_clip->vision_model; - clip_vocab & vocab = new_clip->vocab; - - // load hparams for text - { - auto & hparams = text_model.hparams; - - fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *)&hparams.num_positions, sizeof(hparams.num_positions)); - fin.read((char *)&hparams.hidden_size, sizeof(hparams.hidden_size)); - fin.read((char *)&hparams.n_intermediate, sizeof(hparams.n_intermediate)); - fin.read((char *)&hparams.projection_dim, sizeof(hparams.projection_dim)); - fin.read((char *)&hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer)); - - if (verbosity >= 2) { - printf("\n%s: text model hparams\n", __func__); - printf("n_vocab %d\n", hparams.n_vocab); - printf("num_positions %d\n", hparams.num_positions); - printf("t_hidden_size %d\n", hparams.hidden_size); - 
printf("t_n_intermediate %d\n", hparams.n_intermediate); - printf("t_projection_dim %d\n", hparams.projection_dim); - printf("t_n_head %d\n", hparams.n_head); - printf("t_n_layer %d\n", hparams.n_layer); - } - } - - // load hparams for vision - { - auto & hparams = vision_model.hparams; - - fin.read((char *)&hparams.image_size, sizeof(hparams.image_size)); - fin.read((char *)&hparams.patch_size, sizeof(hparams.patch_size)); - fin.read((char *)&hparams.hidden_size, sizeof(hparams.hidden_size)); - fin.read((char *)&hparams.n_intermediate, sizeof(hparams.n_intermediate)); - fin.read((char *)&hparams.projection_dim, sizeof(hparams.projection_dim)); - fin.read((char *)&hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer)); - - fin.read((char *)&new_clip->use_gelu, sizeof(new_clip->use_gelu)); - fin.read((char *)&new_clip->ftype, sizeof(new_clip->ftype)); - - if (verbosity >= 2) { - printf("\n%s: vision model hparams\n", __func__); - printf("image_size %d\n", hparams.image_size); - printf("patch_size %d\n", hparams.patch_size); - printf("v_hidden_size %d\n", hparams.hidden_size); - printf("v_n_intermediate %d\n", hparams.n_intermediate); - printf("v_projection_dim %d\n", hparams.projection_dim); - printf("v_n_head %d\n", hparams.n_head); - printf("v_n_layer %d\n", hparams.n_layer); - - printf("\nuse_gelu %d\n", new_clip->use_gelu); - printf("ftype %d\n\n", new_clip->ftype); - } - } - - // load vocab - { - int32_t n_vocab = 0; - fin.read((char *)&n_vocab, sizeof(n_vocab)); - - if (n_vocab != new_clip->text_model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", __func__, fname, n_vocab, - new_clip->text_model.hparams.n_vocab); - return nullptr; - } - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *)&len, sizeof(len)); - - buf.resize(len); - fin.read((char *)buf.data(), len); - word.assign(buf.data(), len); - - 
new_clip->vocab.token_to_id[word] = i; - new_clip->vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = GGML_TYPE_COUNT; - switch (new_clip->ftype) { - case 0: - wtype = GGML_TYPE_F32; - break; - case 1: - wtype = GGML_TYPE_F16; - break; - case 2: - wtype = GGML_TYPE_Q4_0; - break; - case 3: - wtype = GGML_TYPE_Q4_1; - break; - default: { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname, new_clip->ftype); - clip_free(new_clip); - return nullptr; - } - } - - auto & ctx = new_clip->ctx; - size_t model_mem_req = 0; - - { - // calculate memory requirement for text_model - const auto & hparams = text_model.hparams; - - const int n_vocab = hparams.n_vocab; - const int num_positions = hparams.num_positions; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - // Calculate size requirements - - model_mem_req += hidden_size * n_vocab * ggml_type_sizef(wtype); // token_embeddings - model_mem_req += hidden_size * num_positions * ggml_type_sizef(wtype); // position_embeddings - - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ln_1_* and ln_2_* - - model_mem_req += 4 * n_layer * (hidden_size * hidden_size * ggml_type_sizef(wtype)); // kqvo weights - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // kqvo bias - model_mem_req += 2 * n_layer * (hidden_size * n_intermediate * ggml_type_sizef(wtype)); // ff_*_w - model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b - model_mem_req += n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b - - model_mem_req += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // post_ln_* - 
model_mem_req += 2 * hidden_size * projection_dim * ggml_type_sizef(wtype); // projection - - model_mem_req += (5 + 16 * n_layer) * 256; // object overhead - } - - { - // calculate memory requirement for vision_model - const auto & hparams = vision_model.hparams; - - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)) + 1; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - // Calculate size requirements - - model_mem_req += hidden_size * ggml_type_sizef(GGML_TYPE_F32); // class_embedding - model_mem_req += hidden_size * 3 * patch_size * patch_size * ggml_type_sizef(GGML_TYPE_F16); // patch_embeddings - model_mem_req += hidden_size * num_patches * ggml_type_sizef(wtype); // position_embeddings - - model_mem_req += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // pre_ln_* - - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ln_* - - model_mem_req += 4 * n_layer * (hidden_size * hidden_size * ggml_type_sizef(wtype)); // kqvo weights - model_mem_req += 4 * n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // kqvo bias - - model_mem_req += 2 * n_layer * (hidden_size * n_intermediate * ggml_type_sizef(wtype)); // ff_*_w - model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b - model_mem_req += n_layer * (hidden_size * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b - - model_mem_req += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // post_ln_* - model_mem_req += 2 * hidden_size * projection_dim * ggml_type_sizef(wtype); // projection - - model_mem_req += (5 + 16 * n_layer) * 256; // object overhead - } - - if (verbosity >= 2) { - printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0)); - } - - // create the 
ggml context - { - struct ggml_init_params params = { - .mem_size = model_mem_req, - .mem_buffer = NULL, - .no_alloc = false, - }; - - new_clip->ctx = ggml_init(params); - if (!new_clip->ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - clip_free(new_clip); - return nullptr; - } - } - - // prepare memory for the text_model weights - { - const auto & hparams = text_model.hparams; - - const int n_vocab = hparams.n_vocab; - const int num_positions = hparams.num_positions; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - text_model.layers.resize(n_layer); - - text_model.token_embeddings = ggml_new_tensor_2d(ctx, wtype, hidden_size, n_vocab); - text_model.position_embeddings = ggml_new_tensor_2d(ctx, wtype, hidden_size, num_positions); - - // map by name - text_model.tensors["text_model.embeddings.token_embedding.weight"] = text_model.token_embeddings; - text_model.tensors["text_model.embeddings.position_embedding.weight"] = text_model.position_embeddings; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = text_model.layers[i]; - - layer.ln_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.o_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.o_b = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.ff_i_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, n_intermediate); - layer.ff_i_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_intermediate); - - layer.ff_o_w = ggml_new_tensor_2d(ctx, wtype, n_intermediate, hidden_size); - layer.ff_o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - // map by name - - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.weight"] = layer.k_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.bias"] = layer.k_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.weight"] = layer.v_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.bias"] = layer.v_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.q_proj.weight"] = layer.q_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.q_proj.bias"] = layer.q_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.weight"] = layer.o_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.bias"] = layer.o_b; - - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".layer_norm1.weight"] = layer.ln_1_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".layer_norm1.bias"] = layer.ln_1_b; - - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.weight"] = layer.ff_i_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.bias"] = layer.ff_i_b; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.weight"] = layer.ff_o_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.bias"] = layer.ff_o_b; - - text_model.tensors["text_model.encoder.layers." 
+ std::to_string(i) + ".layer_norm2.weight"] = layer.ln_2_w; - text_model.tensors["text_model.encoder.layers." + std::to_string(i) + ".layer_norm2.bias"] = layer.ln_2_b; - } - - text_model.post_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - text_model.post_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - text_model.projection = ggml_new_tensor_2d(ctx, wtype, hidden_size, projection_dim); - - // map by name - text_model.tensors["text_model.final_layer_norm.weight"] = text_model.post_ln_w; - text_model.tensors["text_model.final_layer_norm.bias"] = text_model.post_ln_b; - text_model.tensors["text_projection.weight"] = text_model.projection; - } - - // prepare memory for the vision_model weights - { - const auto & hparams = vision_model.hparams; - - const int image_size = hparams.image_size; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)) + 1; - const int hidden_size = hparams.hidden_size; - const int n_layer = hparams.n_layer; - const int n_intermediate = hparams.n_intermediate; - const int projection_dim = hparams.projection_dim; - - vision_model.layers.resize(n_layer); - - vision_model.class_embedding = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.patch_embeddings = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, patch_size, patch_size, 3, hidden_size); - vision_model.position_embeddings = ggml_new_tensor_2d(ctx, wtype, hidden_size, num_patches); - - vision_model.pre_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.pre_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - // map by name - vision_model.tensors["vision_model.embeddings.class_embedding"] = vision_model.class_embedding; - vision_model.tensors["vision_model.embeddings.patch_embedding.weight"] = vision_model.patch_embeddings; - vision_model.tensors["vision_model.embeddings.position_embedding.weight"] = vision_model.position_embeddings; - - 
vision_model.tensors["vision_model.pre_layrnorm.weight"] = vision_model.pre_ln_w; - vision_model.tensors["vision_model.pre_layrnorm.bias"] = vision_model.pre_ln_b; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = vision_model.layers[i]; - - layer.ln_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - layer.o_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size); - layer.o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - layer.ff_i_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, n_intermediate); - layer.ff_i_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_intermediate); - - layer.ff_o_w = ggml_new_tensor_2d(ctx, wtype, n_intermediate, hidden_size); - layer.ff_o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - - // map by name - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.weight"] = layer.k_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.k_proj.bias"] = layer.k_b; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.weight"] = layer.v_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.v_proj.bias"] = layer.v_b; - vision_model.tensors["vision_model.encoder.layers." 
+ std::to_string(i) + ".self_attn.q_proj.weight"] = layer.q_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.q_proj.bias"] = layer.q_b; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.weight"] = layer.o_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".self_attn.out_proj.bias"] = layer.o_b; - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".layer_norm1.weight"] = layer.ln_1_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".layer_norm1.bias"] = layer.ln_1_b; - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.weight"] = layer.ff_i_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc1.bias"] = layer.ff_i_b; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.weight"] = layer.ff_o_w; - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".mlp.fc2.bias"] = layer.ff_o_b; - - vision_model.tensors["vision_model.encoder.layers." + std::to_string(i) + ".layer_norm2.weight"] = layer.ln_2_w; - vision_model.tensors["vision_model.encoder.layers." 
+ std::to_string(i) + ".layer_norm2.bias"] = layer.ln_2_b; - } - - vision_model.post_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.post_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - vision_model.projection = ggml_new_tensor_2d(ctx, wtype, hidden_size, projection_dim); - - // map by name - vision_model.tensors["vision_model.post_layernorm.weight"] = vision_model.post_ln_w; - vision_model.tensors["vision_model.post_layernorm.bias"] = vision_model.post_ln_b; - vision_model.tensors["visual_projection.weight"] = vision_model.projection; - } - - // load weights - { - int n_tensors = 0; - size_t total_size = 0; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - - if (fin.eof()) { - break; - } - - int64_t nelements = 1; - int64_t ne[4] = {1, 1, 1, 1}; - for (int i = 0; i < n_dims; ++i) { - int32_t ne_cur; - fin.read(reinterpret_cast(&ne_cur), sizeof(ne_cur)); - ne[i] = ne_cur; - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - struct ggml_tensor * tensor; - if (text_model.tensors.find(name.data()) != text_model.tensors.end()) { - tensor = text_model.tensors[name.data()]; - } else if (vision_model.tensors.find(name.data()) != vision_model.tensors.end()) { - tensor = vision_model.tensors[name.data()]; - } else { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - clip_free(new_clip); - return nullptr; - } - - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - clip_free(new_clip); - return nullptr; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n", - __func__, 
name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); - clip_free(new_clip); - return nullptr; - } - - if (0) { - static const char * ftype_str[] = { - "f32", - "f16", - "q4_0", - "q4_1", - }; - printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], - ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); - } - - size_t bpe = 0; - - switch (ftype) { - case 0: - bpe = ggml_type_size(GGML_TYPE_F32); - break; - case 1: - bpe = ggml_type_size(GGML_TYPE_F16); - break; - case 2: - bpe = ggml_type_size(GGML_TYPE_Q4_0); - assert(ne[0] % 64 == 0); - break; - case 3: - bpe = ggml_type_size(GGML_TYPE_Q4_1); - assert(ne[0] % 64 == 0); - break; - default: { - fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); - clip_free(new_clip); - return nullptr; - } - }; - - if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), - ggml_nbytes(tensor), nelements * bpe); - clip_free(new_clip); - return nullptr; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - -#ifdef CLIP_DEBUG_TENSORS - printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", - ggml_nbytes(tensor) / 1024.0 / 1024.0); -#endif - - total_size += ggml_nbytes(tensor); - if (verbosity >= 1) { - if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); - } - } - } - - if (verbosity >= 1) { - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); - } - } - - fin.close(); - - // Calculate space requirements for setting up context buffers later - { - // TODO: We currently get the size of memory requirement from the pre-computed information - // based on the model variant, indicated by the number of tensors. 
- // Rewrite this logic when GGML implements a mechanism to predict the required memory. - const size_t n_tensors = new_clip->text_model.tensors.size() + new_clip->vision_model.tensors.size(); - const int n_image_positions = (vision_model.hparams.image_size / vision_model.hparams.patch_size) * - (vision_model.hparams.image_size / vision_model.hparams.patch_size) + - 1; - size_t mem_req = get_mem_req_by_size(n_tensors, n_image_positions); - new_clip->buf_compute.resize(mem_req); - - if (verbosity >= 2) { - printf("%s: %zu MB of compute buffer allocated\n", __func__, mem_req / 1024 / 1024); - } - } - - if (verbosity >= 1) { - printf("%s: model loaded\n\n", __func__); - } - - return new_clip; -} - void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx); + gguf_free(ctx->ctx_gguf); delete ctx; } -bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector & tokens, float * vec) { +bool clip_text_encode(const clip_ctx * ctx, const int n_threads, const clip_tokens * tokens, float * vec, + const bool normalize) { + if (!ctx->has_text_encoder) { + printf("This GGUF file seems to have no text encoder\n"); + return false; + } + const auto & model = ctx->text_model; const auto & hparams = model.hparams; - const int N = tokens.size(); + const size_t N = tokens->size; const int n_vocab = hparams.n_vocab; const int num_positions = hparams.num_positions; @@ -822,6 +880,7 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vectorbuf_compute; @@ -834,11 +893,11 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vectortext_model.tensors.size() + ctx->vision_model.tensors.size(), N); + static size_t scr0_size = get_scr_buf_req_by_size((struct clip_ctx *)ctx); static void * scr0 = malloc(scr0_size); struct ggml_tensor * input_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(input_ids->data, tokens.data(), N * ggml_element_size(input_ids)); + memcpy(input_ids->data, tokens->data, N * ggml_element_size(input_ids)); 
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); for (int i = 0; i < N; i++) { @@ -857,7 +916,7 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector imgs; - imgs.push_back(img); - return clip_image_batch_encode(ctx, n_threads, imgs, vec); +bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, const bool normalize) { + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + + clip_image_f32_batch imgs{}; + imgs.size = 1; + imgs.data = img; + return clip_image_batch_encode(ctx, n_threads, &imgs, vec, normalize); } -bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector & imgs, float * vec) { +bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, + const bool normalize) { + + if (!ctx->has_vision_encoder) { + printf("This gguf file seems to have no vision encoder\n"); + return false; + } + const auto & model = ctx->vision_model; const auto & hparams = model.hparams; @@ -1039,7 +1113,8 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec const int n_layer = hparams.n_layer; const int n_intermediate = hparams.n_intermediate; const int projection_dim = hparams.projection_dim; - int batch_size = imgs.size(); + const float eps = hparams.eps; + int batch_size = imgs->size; auto & buf_compute = ctx->buf_compute; @@ -1052,8 +1127,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - static size_t scr0_size = - get_scr_buf_req_by_size(ctx->text_model.tensors.size() + ctx->vision_model.tensors.size(), num_positions); + static size_t scr0_size = get_scr_buf_req_by_size((struct clip_ctx *)ctx); static void * scr0 = malloc(scr0_size); struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, 
GGML_TYPE_F32, image_size, image_size, 3, batch_size); @@ -1061,9 +1135,9 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec { float * data = (float *)ggml_get_data(inp_raw); - for (int b = 0; b < imgs.size(); b++) { - const int nx = imgs[b].nx; - const int ny = imgs[b].ny; + for (int b = 0; b < imgs->size; b++) { + const int nx = imgs->data[b].nx; + const int ny = imgs->data[b].ny; GGML_ASSERT(nx == image_size && ny == image_size); const int n = nx * ny; @@ -1072,7 +1146,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec for (int k = 0; k < 3; k++) { for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs[b].data[3 * (y * nx + x) + k]; + data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k]; } } } @@ -1106,7 +1180,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // pre-layernorm { - embeddings = ggml_norm(ctx0, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings), ggml_repeat(ctx0, model.pre_ln_b, embeddings)); @@ -1122,7 +1196,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // layernorm1 { - cur = ggml_norm(ctx0, cur); + cur = ggml_norm(ctx0, cur, eps); cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur), ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); @@ -1172,7 +1246,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // layernorm2 { - cur = ggml_norm(ctx0, cur); + cur = ggml_norm(ctx0, cur, eps); cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur), ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); @@ -1205,7 +1279,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // post-layernorm { - 
embeddings = ggml_norm(ctx0, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.post_ln_w, embeddings), embeddings), ggml_repeat(ctx0, model.post_ln_b, embeddings)); @@ -1221,8 +1295,10 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec for (int b = 0; b < batch_size; b++) { struct ggml_tensor * embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b)); - ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); - embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + if (normalize) { + ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding))); + embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + } output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding)); } ggml_set_name(output, "check"); @@ -1230,6 +1306,7 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec // run the computation ggml_build_forward_expand(&gf, output); ggml_cplan cplan = ggml_graph_plan(&gf, n_threads); + cplan.work_size *= batch_size; if (cplan.work_size != 0) { cplan.work_data = (uint8_t *)malloc(cplan.work_size); } @@ -1293,39 +1370,45 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec return true; } -float clip_similarity_score(float * vec1, float * vec2, int vec_dim) { +float clip_similarity_score(const float * vec1, const float * vec2, const int vec_dim) { float dot_product = 0.0; for (int i = 0; i < vec_dim; i++) { dot_product += vec1[i] * vec2[i]; } - // Clamp the dot product to the range [0, 1]. 
- float clamped_dot_product = fmin(fmax(dot_product, 0.0), 1.0); - - return clamped_dot_product; + return dot_product; } -bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & text, clip_image_u8 & image, float * score) { +bool clip_compare_text_and_image(const clip_ctx * ctx, const int n_threads, const char * text, const clip_image_u8 * image, + float * score) { + if (!(ctx->has_text_encoder && ctx->has_vision_encoder)) { + printf("clip_compare_text_and_image function can only be used with two-tower models\n"); + return false; + } + // prepare image and text vectors const int projection_dim = ctx->vision_model.hparams.projection_dim; float img_vec[projection_dim]; float txt_vec[projection_dim]; - // preprocess and encode image - clip_image_f32 img_res; - - if (!clip_image_preprocess(ctx, &image, &img_res)) { + // tokenize and encode text + clip_tokens tokens; + if (!clip_tokenize(ctx, text, &tokens)) { return false; } - if (!clip_image_encode(ctx, n_threads, img_res, img_vec)) { + if (!clip_text_encode(ctx, n_threads, &tokens, txt_vec, true)) { return false; } - // tokenize and encode text - auto tokens = clip_tokenize(ctx, text); + // preprocess and encode image + clip_image_f32 img_res; - if (!clip_text_encode(ctx, n_threads, tokens, txt_vec)) { + if (!clip_image_preprocess(ctx, image, &img_res)) { + return false; + } + + if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, true)) { return false; } @@ -1353,23 +1436,17 @@ int compare_scores(const void * a, const void * b) { } } -bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int * indices) { +bool softmax_with_sorting(float * arr, const int length, float * sorted_scores, int * indices) { ScoreIndexPair * score_index_pairs = (ScoreIndexPair *)malloc(length * sizeof(ScoreIndexPair)); if (!score_index_pairs) { return false; } // Calculate softmax probabilities - float max_val = arr[0]; - for (int i = 1; i < length; i++) { - if (arr[i] > max_val) { - max_val = 
arr[i]; - } - } - float sum = 0.0; + double sum = 0.0; for (int i = 0; i < length; i++) { - arr[i] = exp(arr[i] - max_val); + arr[i] = exp(arr[i]) + 1e-9; sum += arr[i]; } @@ -1392,23 +1469,227 @@ bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int * return true; } -bool image_normalize(clip_image_u8 * img, clip_image_f32 * res) { - if (img->nx != 224 || img->ny != 224) { - printf("%s: long input shape: %d x %d\n", __func__, img->nx, img->ny); +bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img, + const char ** labels, const size_t n_labels, float * scores, int * indices) { + if (!(ctx->has_text_encoder && ctx->has_vision_encoder)) { + printf("clip_zero_shot_label_image function can only be used with two-tower models\n"); return false; } - const float m3[3] = {0.48145466f, 0.4578275f, 0.40821073f}; - const float s3[3] = {0.26862954f, 0.26130258f, 0.27577711f}; + // load the image + clip_image_f32 img_res; + + const int vec_dim = clip_get_vision_hparams(ctx)->projection_dim; - for (int y = 0; y < img->ny; y++) { - for (int x = 0; x < img->nx; x++) { - for (int c = 0; c < 3; c++) { - const int i = 3 * (y * img->nx + x) + c; - float v = (float)img->data[i]; - res->data[i] = ((v / 255.0f) - m3[c]) / s3[c]; + clip_image_preprocess(ctx, input_img, &img_res); + + float img_vec[vec_dim]; + if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, false)) { + return false; + } + + // encode texts and compute similarities + float txt_vec[vec_dim]; + float similarities[n_labels]; + + for (int i = 0; i < n_labels; i++) { + const auto & text = labels[i]; + clip_tokens tokens; + clip_tokenize(ctx, text, &tokens); + clip_text_encode(ctx, n_threads, &tokens, txt_vec, false); + similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim); + } + + // apply softmax and sort scores + softmax_with_sorting(similarities, n_labels, scores, indices); + + return true; +} + +bool 
clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { + + ggml_type type = GGML_TYPE_Q4_1; + + switch (itype) { + case 2: + type = GGML_TYPE_Q4_0; + break; + case 3: + type = GGML_TYPE_Q4_1; + break; + case 6: + type = GGML_TYPE_Q5_0; + break; + case 7: + type = GGML_TYPE_Q5_1; + break; + case 8: + type = GGML_TYPE_Q8_0; + break; + default: + fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); + return false; + }; + + auto ctx_clip = clip_model_load(fname_inp, 2); + const auto & ctx_src = ctx_clip->ctx_gguf; + const auto & ctx_data = ctx_clip->ctx; + + auto ctx_out = gguf_init_empty(); + gguf_set_kv(ctx_out, ctx_src); + gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); + gguf_set_val_u32(ctx_out, "general.file_type", itype); + + auto fout = std::ofstream(fname_out, std::ios::binary); + + const int n_tensors = gguf_get_n_tensors(ctx_src); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); + gguf_add_tensor(ctx_out, cur); + } + + const size_t meta_size = gguf_get_meta_size(ctx_out); + for (size_t i = 0; i < meta_size; ++i) { + fout.put(0); + } + + // regexes of tensor names to be quantized + const std::vector k_names = { + ".*weight", + }; + + std::vector read_data(512); + std::vector work(512); + std::vector conv_buf(512); + std::vector hist_all(1 << 4, 0); + size_t total_size_org = 0; + size_t total_size_new = 0; + + for (int i = 0; i < n_tensors; ++i) { + const std::string name = gguf_get_tensor_name(ctx_src, i); + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + enum ggml_type new_type; + void * new_data; + size_t new_size; + + bool quantize = false; + for (const auto & s : k_names) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // quantize only 2D tensors + quantize &= (cur->n_dims == 2); + + if (quantize) { + 
new_type = type; + const size_t n_elms = ggml_nelements(cur); + float * f32_data; + + switch (cur->type) { + case GGML_TYPE_F32: + f32_data = (float *)cur->data; + break; + case GGML_TYPE_F16: + if (conv_buf.size() < n_elms) { + conv_buf.resize(n_elms); + } + for (int j = 0; j < n_elms; ++j) { + conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]); + } + f32_data = (float *)conv_buf.data(); + break; + default: + printf("Please use an input file in f32 or f16\n"); + return false; + } + + if (work.size() < n_elms * 4) { + work.resize(n_elms * 4); + } + new_data = work.data(); + + std::vector hist_cur(1 << 4, 0); + + switch (new_type) { + case GGML_TYPE_Q4_0: { + new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: { + new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: { + new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: { + new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: { + new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); + } break; + default: { + fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type); + return false; + } } + + for (int j = 0; j < hist_cur.size(); ++j) { + hist_all[j] += hist_cur[j]; + } + } else { + new_type = cur->type; + new_data = cur->data; + new_size = ggml_nbytes(cur); } + const size_t orig_size = ggml_nbytes(cur); + total_size_org += orig_size; + total_size_new += new_size; + gguf_set_tensor_type(ctx_out, name.c_str(), new_type); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + fout.write((const char *)new_data, new_size); + size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; + for (int j = 0; j < pad; ++j) { + fout.put(0); + } + + printf("%s: 
n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize, + orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + } + + // go back to beginning of file and write the updated metadata + fout.seekp(0, std::ios::beg); + std::vector meta(meta_size); + gguf_get_meta_data(ctx_out, meta.data()); + fout.write((const char *)meta.data(), meta_size); + + fout.close(); + + clip_free(ctx_clip); + gguf_free(ctx_out); + + { + printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + + int64_t sum_all = 0; + for (size_t i = 0; i < hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (size_t i = 0; i < hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); } + return true; } + +struct clip_text_hparams * clip_get_text_hparams(struct clip_ctx * ctx) { return &ctx->text_model.hparams; } +struct clip_vision_hparams * clip_get_vision_hparams(struct clip_ctx * ctx) { return &ctx->vision_model.hparams; } diff --git a/clip.h b/clip.h index 3cbbc66..18fe3da 100644 --- a/clip.h +++ b/clip.h @@ -1,205 +1,106 @@ #ifndef CLIP_H #define CLIP_H -#include "llama.cpp/ggml.h" -#include -#include -#include -#include - -// TODO: make the API in C -// #ifdef __cplusplus -// extern "C" { -// #endif - -// default hparams for text_model (ViT-B/32) -struct clip_text_hparams { - int32_t n_vocab = 49408; - int32_t num_positions = 77; - int32_t hidden_size = 512; - int32_t n_intermediate = 2048; - int32_t projection_dim = 512; - int32_t n_head = 8; - int32_t n_layer = 12; -}; - -// default hparams for vision_model (ViT-B/32) -struct clip_vision_hparams { - int32_t image_size = 224; - int32_t patch_size = 32; - int32_t hidden_size = 768; - int32_t n_intermediate = 3072; - int32_t projection_dim = 512; - int32_t n_head = 12; - int32_t n_layer = 12; -}; - -// -// Vocab utils 
-// - -std::string trim(const std::string & s); - -std::string replace(const std::string & s, const std::string & from, const std::string & to); +#include "ggml.h" -struct clip_vocab { - using id = int32_t; - using token = std::string; +struct clip_ctx; - std::map token_to_id; - std::map id_to_token; - std::vector special_tokens; +#ifdef __cplusplus +extern "C" { +#endif - void add_special_token(const std::string & token); -}; - -std::string convert_to_utf8(const std::wstring & input); - -std::wstring convert_to_wstring(const std::string & input); - -// split text into tokens -// -// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 -// -// Regex (Python): -// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" -// -// Regex (C++): -// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" -// - -struct clip_layer { - // attention - struct ggml_tensor * k_w; - struct ggml_tensor * k_b; - struct ggml_tensor * q_w; - struct ggml_tensor * q_b; - struct ggml_tensor * v_w; - struct ggml_tensor * v_b; - - struct ggml_tensor * o_w; - struct ggml_tensor * o_b; - - // layernorm 1 - struct ggml_tensor * ln_1_w; - struct ggml_tensor * ln_1_b; - - // ff - struct ggml_tensor * ff_i_w; - struct ggml_tensor * ff_i_b; - - struct ggml_tensor * ff_o_w; - struct ggml_tensor * ff_o_b; - - // layernorm 2 - struct ggml_tensor * ln_2_w; - struct ggml_tensor * ln_2_b; +struct clip_text_hparams { + int32_t n_vocab; + int32_t num_positions; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + float eps; }; -struct clip_text_model { - clip_text_hparams hparams; - - // embeddings - struct ggml_tensor * token_embeddings; - struct ggml_tensor * position_embeddings; - - std::vector layers; - - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; - - struct ggml_tensor * projection; - - std::map 
tensors; +struct clip_vision_hparams { + int32_t image_size; + int32_t patch_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + float eps; }; -struct clip_vision_model { - clip_vision_hparams hparams; - - // embeddings - struct ggml_tensor * class_embedding; - struct ggml_tensor * patch_embeddings; - struct ggml_tensor * position_embeddings; - - struct ggml_tensor * pre_ln_w; - struct ggml_tensor * pre_ln_b; - - std::vector layers; - - struct ggml_tensor * post_ln_w; - struct ggml_tensor * post_ln_b; - - struct ggml_tensor * projection; - - std::map tensors; +typedef int32_t clip_vocab_id; +struct clip_tokens { + clip_vocab_id * data; + size_t size; }; struct clip_ctx * clip_model_load(const char * fname, const int verbosity); -// Replacement for std::vector that doesn't require zero-initialization. -struct clip_buffer { - uint8_t * data = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] data; - data = new uint8_t[size]; - this->size = size; - } +void clip_free(struct clip_ctx * ctx); - ~clip_buffer() { delete[] data; } -}; - -struct clip_ctx { - clip_text_model text_model; - clip_vision_model vision_model; - clip_vocab vocab; - int32_t use_gelu = 0; - int32_t ftype = 1; - ggml_context * ctx; - clip_buffer buf_compute; -}; - -void clip_free(clip_ctx * ctx); +struct clip_text_hparams * clip_get_text_hparams(struct clip_ctx * ctx); +struct clip_vision_hparams * clip_get_vision_hparams(struct clip_ctx * ctx); // RGB uint8 image struct clip_image_u8 { int nx; int ny; - - std::vector data; + uint8_t * data; + size_t size; }; -// RGB float32 image +// RGB float32 image (NHWC) // Memory layout: RGBRGBRGB... 
struct clip_image_f32 { int nx; int ny; + float * data; + size_t size; +}; + +struct clip_image_u8_batch { + struct clip_image_u8 * data; + size_t size; +}; - std::vector data; +struct clip_image_f32_batch { + struct clip_image_f32 * data; + size_t size; }; -std::vector clip_tokenize(const clip_ctx * ctx, const std::string & text); +bool clip_tokenize(const struct clip_ctx * ctx, const char * text, struct clip_tokens * tokens); -bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img); -bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res); -void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const std::vector & img_inputs, - std::vector & img_resized); +struct clip_image_u8 * make_clip_image_u8(); +struct clip_image_f32 * make_clip_image_f32(); +bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); +bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res); -bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector & tokens, float * vec); +bool clip_text_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_tokens * tokens, float * vec, + const bool normalize); +bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec, + const bool normalize); -bool clip_image_encode(const clip_ctx * ctx, int n_threads, const clip_image_f32 & img, float * vec); +void clip_image_batch_preprocess(const struct clip_ctx * ctx, const int n_threads, + const struct clip_image_u8_batch * img_inputs, struct clip_image_f32_batch * imgs_resized); +bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs, + float * vec, const bool normalize); -// bool image_normalize(clip_image_u8 *img, clip_image_f32 *res); +// bool image_normalize(const clip_image_u8 *img, 
clip_image_f32 *res); -bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & text, clip_image_u8 & image, float * score); -float clip_similarity_score(float * vec1, float * vec2, int vec_dim); -bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int * indices); +bool clip_compare_text_and_image(const struct clip_ctx * ctx, const int n_threads, const char * text, + const struct clip_image_u8 * image, float * score); +float clip_similarity_score(const float * vec1, const float * vec2, const int vec_dim); +bool softmax_with_sorting(float * arr, const int length, float * sorted_scores, int * indices); +bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img, + const char ** labels, const size_t n_labels, float * scores, int * indices); -bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector & imgs, float * vec); +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype); -// #ifdef __cplusplus -// } -// #endif +#ifdef __cplusplus +} +#endif -#endif // CLIP_H \ No newline at end of file +#endif // CLIP_H diff --git a/llama.cpp b/llama.cpp index 294f424..40e07a6 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 294f424554c1599784ac9962462fc39ace92d8a5 +Subproject commit 40e07a60f9ce06e79f3ccd4c903eba300fb31b5e diff --git a/main.cpp b/main.cpp index 4b8de59..0904b59 100644 --- a/main.cpp +++ b/main.cpp @@ -1,7 +1,11 @@ #include "clip.h" -#include "llama.h" -int main() { - printf("Hello from LLava.cpp\n"); +int main(int argc, char ** argv) { + const char * path = argv[1]; + + auto p_ctx = clip_model_load(path, 1); + + clip_free(p_ctx); + return 0; -} \ No newline at end of file +} diff --git a/models/convert_hf_to_gguf.py b/models/convert_hf_to_gguf.py new file mode 100644 index 0000000..2d1a47c --- /dev/null +++ b/models/convert_hf_to_gguf.py @@ -0,0 +1,240 @@ +import argparse +import os 
+import json + +import torch +import numpy as np +from gguf import * +from transformers import CLIPModel, CLIPProcessor + +TEXT = "clip.text" +VISION = "clip.vision" + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if name == "visual_projection.weight" and has_llava: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + +ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py") +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--llava-projector", help="Path to projector.pt file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") +ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) + +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --vision-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + + +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + + +model = CLIPModel.from_pretrained(dir_model) +processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{output_prefix}_ggml-{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector)
+fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + + image_mean = processor.image_processor.image_mean if 
args.image_mean is None else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None else args.image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" +fout.add_bool("clip.use_gelu", use_gelu) + + + + +if has_llava_projector: + model.vision_model.encoder.layers.pop(-1) + projector = torch.load(args.llava_projector) + weight = projector["model.mm_projector.weight"].cpu().squeeze().float().numpy().astype(np.float16) + bias = projector['model.mm_projector.bias'].cpu().squeeze().float().numpy().astype(np.float32) + fout.add_tensor("llava_projector.weight", weight) + fout.add_tensor("llava_projector.bias", bias) + print("Projector tensors added\n") + + +list_vars = model.state_dict() +for name, data in list_vars.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. 
Output file: " + fname_out) diff --git a/models/llava_surgery.py b/models/llava_surgery.py new file mode 100644 index 0000000..a97cc06 --- /dev/null +++ b/models/llava_surgery.py @@ -0,0 +1,63 @@ +import argparse +from llava.model import LlavaLlamaForCausalLM +from transformers import AutoTokenizer +from peft import PeftModel +import torch + +dtype = torch.bfloat16 + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA RLHF model") +ap.add_argument("-o", "--output", help="Output directory to save the merged file") +args = ap.parse_args() + +model_path = f"{args.model}/sft_model" +lora_path = f"{args.model}/rlhf_lora_adapter_model" +save_path = args.output + +model = LlavaLlamaForCausalLM.from_pretrained( + model_path, + device_map={"": "cuda:0"}, + torch_dtype=dtype, +) +model = PeftModel.from_pretrained( + model, + lora_path, +) + + +model = model.merge_and_unload() + +model.save_pretrained(save_path) + +tokenizer = AutoTokenizer.from_pretrained(model_path) +tokenizer.save_pretrained(save_path) + +del model +del tokenizer + + +# Load the checkpoint +checkpoint = torch.load(f"{save_path}/pytorch_model-00002-of-00002.bin") + +# Extract the tensors we want +mm_projector_weight = checkpoint['model.mm_projector.weight'] +mm_projector_bias = checkpoint['model.mm_projector.bias'] + +# Remove the tensors from the checkpoint +del checkpoint['model.mm_projector.weight'] +del checkpoint['model.mm_projector.bias'] + +# Create a dictionary with the original names as keys +mm_projector = { + 'model.mm_projector.weight': mm_projector_weight, + 'model.mm_projector.bias': mm_projector_bias +} + +# Save the combined dictionary using torch.save +torch.save(mm_projector, "projector.pt") + +# Save the rest of the model with the same original name +torch.save(checkpoint, f"{save_path}/pytorch_model-00002-of-00002.bin") + +print("Operation complete!") diff --git a/models/requirements.txt b/models/requirements.txt new file mode 100644 index
0000000..a753aab --- /dev/null +++ b/models/requirements.txt @@ -0,0 +1,4 @@ +torch +transformers +peft +gguf