feat: reduce CLIP memory usage with no embeddings #768

Open · wants to merge 2 commits into master
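This PR makes two related changes. First, CLIPEmbeddings no longer keeps token_embedding.weight in F32 unconditionally: a new force_clip_f32 flag forces F32 only when custom embeddings are in use (FrozenCLIPEmbedderWithCustomWords derives it from embd_dir.size() > 0, since concatenating F32 custom embeddings requires an F32 table); otherwise the weight keeps whatever type, quantized or not, it has in the model file. Second, clip_skip stops being mutable model state set via set_clip_skip() and is instead threaded through forward(), build_graph() and compute() as a per-call argument.

To gauge the saving, here is a minimal sketch, assuming current ggml (ggml_row_size()) and CLIP-L's 49408 × 768 token table; it is illustrative only, not part of the PR:

    #include <cstdio>
    #include "ggml.h"

    int main() {
        const int64_t vocab = 49408, dim = 768;  // CLIP-L token embedding shape
        // ggml_row_size() returns the byte size of one row of `dim` elements.
        size_t f32_bytes  = ggml_row_size(GGML_TYPE_F32, dim) * vocab;   // ~144.8 MiB
        size_t q8_0_bytes = ggml_row_size(GGML_TYPE_Q8_0, dim) * vocab;  // ~38.5 MiB
        printf("f32 : %.1f MiB\n", f32_bytes / (1024.0 * 1024.0));
        printf("q8_0: %.1f MiB\n", q8_0_bytes / (1024.0 * 1024.0));
        return 0;
    }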
clip.hpp — 49 changes: 24 additions & 25 deletions
@@ -544,9 +544,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -556,10 +562,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -674,12 +682,11 @@ class CLIPTextModel : public GGMLBlock {
     int32_t n_head         = 12;
     int32_t n_layer        = 12;    // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip      = -1;
     bool with_final_ln     = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln  = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -692,20 +699,12 @@ class CLIPTextModel : public GGMLBlock {
             n_head            = 20;
             n_layer           = 32;
         }
-        set_clip_skip(clip_skip_value);
 
-        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -715,7 +714,8 @@ class CLIPTextModel : public GGMLBlock {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder    = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -872,19 +872,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         bool with_final_ln  = true,
-                        int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
     std::string get_desc() {
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -893,22 +889,24 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
         size_t N       = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
             GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings    = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx         = 0,
-                                    bool return_pooled           = false) {
+                                    bool return_pooled           = false,
+                                    int clip_skip                = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -927,7 +925,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -940,10 +938,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
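With set_clip_skip() gone from CLIPTextModelRunner, callers pass clip_skip with each call. A minimal sketch of a call site against the new compute() signature (text_model, input_ids, max_token_idx, work_ctx and n_threads stand in for whatever the caller has in scope):

    ggml_tensor* hidden_states = nullptr;
    text_model->compute(n_threads,
                        input_ids,
                        0,        // num_custom_embeddings
                        nullptr,  // custom_embeddings_data
                        max_token_idx,
                        false,    // return_pooled
                        -1,       // clip_skip: -1 = no skip, run all layers
                        &hidden_states,
                        work_ctx);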
conditioner.hpp — 74 changes: 26 additions & 48 deletions
@@ -60,30 +60,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                        const String2GGMLType& tensor_types,
                                        const std::string& embd_dir,
                                        SDVersion version = VERSION_SD1,
-                                       PMVersion pv      = PM_VERSION_1,
-                                       int clip_skip     = -1)
+                                       PMVersion pv      = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 
@@ -411,7 +397,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                      int height,
                      int adm_in_channels        = -1,
                      bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         int64_t t0                              = ggml_time_ms();
         struct ggml_tensor* hidden_states       = NULL;  // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
Expand Down Expand Up @@ -454,6 +443,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
false,
clip_skip,
&chunk_hidden_states1,
work_ctx);
if (sd_version_is_sdxl(version)) {
@@ -463,6 +453,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          token_embed_custom.data(),
                                          max_token_idx,
                                          false,
+                                         clip_skip,
                                          &chunk_hidden_states2, work_ctx);
                     // concat
                     chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -474,6 +465,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          token_embed_custom.data(),
                                          max_token_idx,
                                          true,
+                                         clip_skip,
                                          &pooled,
                                          work_ctx);
                 }
@@ -663,21 +655,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
 
     SD3CLIPEmbedder(ggml_backend_t backend,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip                       = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
     }
 
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -774,7 +756,9 @@ struct SD3CLIPEmbedder : public Conditioner {
                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                          int clip_skip,
                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens  = token_and_weights[1].first;
@@ -812,6 +796,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_l,
                                 work_ctx);
                 {
@@ -839,6 +824,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_l,
                                     work_ctx);
                 }
@@ -860,6 +846,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_g,
                                 work_ctx);
 
@@ -888,6 +875,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_g,
                                     work_ctx);
                 }
@@ -1010,18 +998,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip                       = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
     }
 
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1102,7 +1081,9 @@ struct FluxCLIPEmbedder : public Conditioner {
                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                          int clip_skip,
                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens      = token_and_weights[1].first;
@@ -1136,6 +1117,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled,
                                 work_ctx);
             }
@@ -1232,16 +1214,12 @@ struct PixArtCLIPEmbedder : public Conditioner {
 
     PixArtCLIPEmbedder(ggml_backend_t backend,
                        const String2GGMLType& tensor_types = {},
-                       int clip_skip                       = -1,
                        bool use_mask                       = false,
                        int mask_pad                        = 1)
         : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
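Since the embedders no longer hold clip_skip state, each one resolves a non-positive value inline at encode time, as the hunks above show: FrozenCLIPEmbedderWithCustomWords picks 2 for SD2.x/SDXL and 1 otherwise, while SD3CLIPEmbedder and FluxCLIPEmbedder always fall back to 2. A hypothetical helper collecting the FrozenCLIP defaults in one place, for illustration only:

    // Mirrors the inline logic in FrozenCLIPEmbedderWithCustomWords;
    // the SD3 and Flux embedders hardcode the fallback to 2 instead.
    static int resolve_clip_skip(int clip_skip, SDVersion version) {
        if (clip_skip > 0)
            return clip_skip;  // caller supplied an explicit value
        return (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
    }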
stable-diffusion.cpp — 1 change: 0 additions & 1 deletion
@@ -346,7 +346,6 @@ class StableDiffusionGGML {
         if (is_chroma) {
             cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
                                                                     model_loader.tensor_storages_types,
-                                                                    -1,
                                                                     sd_ctx_params->chroma_use_t5_mask,
                                                                     sd_ctx_params->chroma_t5_mask_pad);
         } else {