Skip to content

Commit d34633d

Browse files
authored
clip : support more quantization types (#4846)
Uses ggml functions instead of hardcoded names and adds support for quantizing into the modern Q-K variants. This is just the bare minimum to get k-types working - a more refined choice of types would be needed to get the best quality on low quantizations. I ran a few tests; it doesn't break anything I could notice, and a Q6_K ViT works almost as well as Q8_0 but at 3 times the inference speed.
1 parent 4f56458 commit d34633d

File tree

1 file changed

+24
-38
lines changed

1 file changed

+24
-38
lines changed

examples/llava/clip.cpp

Lines changed: 24 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -126,24 +126,7 @@ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::str
126126
}
127127

128128
static std::string get_ftype(int ftype) {
129-
switch (ftype) {
130-
case 0:
131-
return "f32";
132-
case 1:
133-
return "f16";
134-
case 2:
135-
return "q4_0";
136-
case 3:
137-
return "q4_1";
138-
case 6:
139-
return "q5_0";
140-
case 7:
141-
return "q5_1";
142-
case 8:
143-
return "q8_0";
144-
default:
145-
throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
146-
}
129+
return ggml_type_name(static_cast<ggml_type>(ftype));
147130
}
148131

149132
//
@@ -533,6 +516,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
533516
buffer_size += n_tensors * 128 /* CLIP PADDING */;
534517

535518
clip_ctx * new_clip = new clip_ctx;
519+
536520
#ifdef GGML_USE_CUBLAS
537521
new_clip->backend = ggml_backend_cuda_init(0);
538522
printf("%s: CLIP using CUDA backend\n", __func__);
@@ -543,6 +527,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
543527
printf("%s: CLIP using Metal backend\n", __func__);
544528
#endif
545529

530+
546531
if (!new_clip->backend) {
547532
new_clip->backend = ggml_backend_cpu_init();
548533
printf("%s: CLIP using CPU backend\n", __func__);
@@ -931,26 +916,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
931916

932917
ggml_type type = GGML_TYPE_Q4_1;
933918

934-
switch (itype) {
935-
case 2:
936-
type = GGML_TYPE_Q4_0;
937-
break;
938-
case 3:
939-
type = GGML_TYPE_Q4_1;
940-
break;
941-
case 6:
942-
type = GGML_TYPE_Q5_0;
943-
break;
944-
case 7:
945-
type = GGML_TYPE_Q5_1;
946-
break;
947-
case 8:
948-
type = GGML_TYPE_Q8_0;
949-
break;
950-
default:
951-
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
952-
return false;
953-
};
919+
assert(itype < GGML_TYPE_COUNT);
920+
type = static_cast<ggml_type>(itype);
954921

955922
auto * ctx_clip = clip_model_load(fname_inp, 2);
956923

@@ -1010,6 +977,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1010977

1011978
if (quantize) {
1012979
new_type = type;
980+
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
981+
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
982+
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
983+
}
1013984
const size_t n_elms = ggml_nelements(cur);
1014985
float * f32_data;
1015986

@@ -1054,6 +1025,21 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
10541025
case GGML_TYPE_Q8_0: {
10551026
new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
10561027
} break;
1028+
case GGML_TYPE_Q2_K: {
1029+
new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
1030+
} break;
1031+
case GGML_TYPE_Q3_K: {
1032+
new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
1033+
} break;
1034+
case GGML_TYPE_Q4_K: {
1035+
new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
1036+
} break;
1037+
case GGML_TYPE_Q5_K: {
1038+
new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
1039+
} break;
1040+
case GGML_TYPE_Q6_K: {
1041+
new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
1042+
} break;
10571043
default: {
10581044
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
10591045
return false;

0 commit comments

Comments (0)