feat: reduce CLIP memory usage with no embeddings #768

Open · wants to merge 2 commits into master
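This PR makes two related changes. First, CLIPEmbeddings no longer keeps token_embedding.weight in F32 unconditionally: a new force_clip_f32 flag forces F32 only when custom embeddings are in use (FrozenCLIPEmbedderWithCustomWords derives it from embd_dir.size() > 0, since concatenating F32 custom embeddings requires an F32 table); otherwise the weight keeps whatever type, quantized or not, it has in the model file. Second, clip_skip stops being mutable model state set via set_clip_skip() and is instead threaded through forward(), build_graph() and compute() as a per-call argument.

To gauge the saving, here is a minimal sketch, assuming current ggml (ggml_row_size()) and CLIP-L's 49408 × 768 token table; it is illustrative only, not part of the PR:

    #include <cstdio>
    #include "ggml.h"

    int main() {
        const int64_t vocab = 49408, dim = 768;  // CLIP-L token embedding shape
        // ggml_row_size() returns the byte size of one row of `dim` elements.
        size_t f32_bytes  = ggml_row_size(GGML_TYPE_F32, dim) * vocab;   // ~144.8 MiB
        size_t q8_0_bytes = ggml_row_size(GGML_TYPE_Q8_0, dim) * vocab;  // ~38.5 MiB
        printf("f32 : %.1f MiB\n", f32_bytes / (1024.0 * 1024.0));
        printf("q8_0: %.1f MiB\n", q8_0_bytes / (1024.0 * 1024.0));
        return 0;
    }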
clip.hpp — 49 changes: 24 additions & 25 deletions
@@ -544,9 +544,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -556,10 +562,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -674,12 +682,11 @@ class CLIPTextModel : public GGMLBlock {
     int32_t n_head         = 12;
     int32_t n_layer        = 12;    // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip      = -1;
     bool with_final_ln     = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln  = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -692,20 +699,12 @@ class CLIPTextModel : public GGMLBlock {
             n_head            = 20;
             n_layer           = 32;
         }
-        set_clip_skip(clip_skip_value);
 
-        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -715,7 +714,8 @@ class CLIPTextModel : public GGMLBlock {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder    = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -872,19 +872,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         bool with_final_ln  = true,
-                        int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
     std::string get_desc() {
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -893,22 +889,24 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled   = false) {
+                                bool return_pooled   = false,
+                                int clip_skip        = -1) {
         size_t N       = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
             GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings    = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx         = 0,
-                                    bool return_pooled           = false) {
+                                    bool return_pooled           = false,
+                                    int clip_skip                = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -927,7 +925,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -940,10 +938,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
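With set_clip_skip() gone from CLIPTextModelRunner, callers pass clip_skip with each call. A minimal sketch of a call site against the new compute() signature (text_model, input_ids, max_token_idx, work_ctx and n_threads stand in for whatever the caller has in scope):

    ggml_tensor* hidden_states = nullptr;
    text_model->compute(n_threads,
                        input_ids,
                        0,        // num_custom_embeddings
                        nullptr,  // custom_embeddings_data
                        max_token_idx,
                        false,    // return_pooled
                        -1,       // clip_skip: -1 = no skip, run all layers
                        &hidden_states,
                        work_ctx);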
conditioner.hpp — 74 changes: 26 additions & 48 deletions
@@ -60,30 +60,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                        const String2GGMLType& tensor_types,
                                        const std::string& embd_dir,
                                        SDVersion version = VERSION_SD1,
-                                       PMVersion pv      = PM_VERSION_1,
-                                       int clip_skip     = -1)
+                                       PMVersion pv      = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 
@@ -411,7 +397,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                      int height,
                      int adm_in_channels        = -1,
                      bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         int64_t t0                              = ggml_time_ms();
         struct ggml_tensor* hidden_states       = NULL;  // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
Expand Down Expand Up @@ -454,6 +443,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
false,
clip_skip,
&chunk_hidden_states1,
work_ctx);
if (sd_version_is_sdxl(version)) {
@@ -463,6 +453,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          token_embed_custom.data(),
                                          max_token_idx,
                                          false,
+                                         clip_skip,
                                          &chunk_hidden_states2, work_ctx);
                     // concat
                     chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -474,6 +465,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          token_embed_custom.data(),
                                          max_token_idx,
                                          true,
+                                         clip_skip,
                                          &pooled,
                                          work_ctx);
                 }
@@ -663,21 +655,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
 
     SD3CLIPEmbedder(ggml_backend_t backend,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip                       = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
     }
 
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -774,7 +756,9 @@ struct SD3CLIPEmbedder : public Conditioner {
                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                          int clip_skip,
                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens  = token_and_weights[1].first;
@@ -812,6 +796,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_l,
                                 work_ctx);
                 {
@@ -839,6 +824,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_l,
                                     work_ctx);
                 }
@@ -860,6 +846,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_g,
                                 work_ctx);
 
@@ -888,6 +875,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_g,
                                     work_ctx);
                 }
@@ -1010,18 +998,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip                       = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
     }
 
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1102,7 +1081,9 @@ struct FluxCLIPEmbedder : public Conditioner {
                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                          int clip_skip,
                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens  = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens      = token_and_weights[1].first;
@@ -1136,6 +1117,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled,
                                 work_ctx);
             }
@@ -1232,16 +1214,12 @@ struct PixArtCLIPEmbedder : public Conditioner {
 
     PixArtCLIPEmbedder(ggml_backend_t backend,
                        const String2GGMLType& tensor_types = {},
-                       int clip_skip                       = -1,
                        bool use_mask                       = false,
                        int mask_pad                        = 1)
         : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
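Since the embedders no longer hold clip_skip state, each one resolves a non-positive value inline at encode time, as the hunks above show: FrozenCLIPEmbedderWithCustomWords picks 2 for SD2.x/SDXL and 1 otherwise, while SD3CLIPEmbedder and FluxCLIPEmbedder always fall back to 2. A hypothetical helper collecting the FrozenCLIP defaults in one place, for illustration only:

    // Mirrors the inline logic in FrozenCLIPEmbedderWithCustomWords;
    // the SD3 and Flux embedders hardcode the fallback to 2 instead.
    static int resolve_clip_skip(int clip_skip, SDVersion version) {
        if (clip_skip > 0)
            return clip_skip;  // caller supplied an explicit value
        return (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
    }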
stable-diffusion.cpp — 1 change: 0 additions & 1 deletion
@@ -346,7 +346,6 @@ class StableDiffusionGGML {
         if (is_chroma) {
             cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
                                                                     model_loader.tensor_storages_types,
-                                                                    -1,
                                                                     sd_ctx_params->chroma_use_t5_mask,
                                                                     sd_ctx_params->chroma_t5_mask_pad);
         } else {