@@ -126,24 +126,7 @@ static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::str
126
126
}
127
127
128
128
static std::string get_ftype (int ftype) {
129
- switch (ftype) {
130
- case 0 :
131
- return " f32" ;
132
- case 1 :
133
- return " f16" ;
134
- case 2 :
135
- return " q4_0" ;
136
- case 3 :
137
- return " q4_1" ;
138
- case 6 :
139
- return " q5_0" ;
140
- case 7 :
141
- return " q5_1" ;
142
- case 8 :
143
- return " q8_0" ;
144
- default :
145
- throw std::runtime_error (format (" %s: Unrecognized file type: %d\n " , __func__, ftype));
146
- }
129
+ return ggml_type_name (static_cast <ggml_type>(ftype));
147
130
}
148
131
149
132
//
@@ -533,6 +516,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
533
516
buffer_size += n_tensors * 128 /* CLIP PADDING */ ;
534
517
535
518
clip_ctx * new_clip = new clip_ctx;
519
+
536
520
#ifdef GGML_USE_CUBLAS
537
521
new_clip->backend = ggml_backend_cuda_init (0 );
538
522
printf (" %s: CLIP using CUDA backend\n " , __func__);
@@ -543,6 +527,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
543
527
printf (" %s: CLIP using Metal backend\n " , __func__);
544
528
#endif
545
529
530
+
546
531
if (!new_clip->backend ) {
547
532
new_clip->backend = ggml_backend_cpu_init ();
548
533
printf (" %s: CLIP using CPU backend\n " , __func__);
@@ -931,26 +916,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
931
916
932
917
ggml_type type = GGML_TYPE_Q4_1;
933
918
934
- switch (itype) {
935
- case 2 :
936
- type = GGML_TYPE_Q4_0;
937
- break ;
938
- case 3 :
939
- type = GGML_TYPE_Q4_1;
940
- break ;
941
- case 6 :
942
- type = GGML_TYPE_Q5_0;
943
- break ;
944
- case 7 :
945
- type = GGML_TYPE_Q5_1;
946
- break ;
947
- case 8 :
948
- type = GGML_TYPE_Q8_0;
949
- break ;
950
- default :
951
- fprintf (stderr, " %s: invalid quantization type %d\n " , __func__, itype);
952
- return false ;
953
- };
919
+ assert (itype < GGML_TYPE_COUNT);
920
+ type = static_cast <ggml_type>(itype);
954
921
955
922
auto * ctx_clip = clip_model_load (fname_inp, 2 );
956
923
@@ -1010,6 +977,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1010
977
1011
978
if (quantize) {
1012
979
new_type = type;
980
+ if (new_type >= GGML_TYPE_Q2_K && name.find (" embd" ) != std::string::npos) {
981
+ new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
982
+ // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
983
+ }
1013
984
const size_t n_elms = ggml_nelements (cur);
1014
985
float * f32_data;
1015
986
@@ -1054,6 +1025,21 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
1054
1025
case GGML_TYPE_Q8_0: {
1055
1026
new_size = ggml_quantize_q8_0 (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1056
1027
} break ;
1028
+ case GGML_TYPE_Q2_K: {
1029
+ new_size = ggml_quantize_q2_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1030
+ } break ;
1031
+ case GGML_TYPE_Q3_K: {
1032
+ new_size = ggml_quantize_q3_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1033
+ } break ;
1034
+ case GGML_TYPE_Q4_K: {
1035
+ new_size = ggml_quantize_q4_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1036
+ } break ;
1037
+ case GGML_TYPE_Q5_K: {
1038
+ new_size = ggml_quantize_q5_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1039
+ } break ;
1040
+ case GGML_TYPE_Q6_K: {
1041
+ new_size = ggml_quantize_q6_K (f32_data, new_data, n_elms, cur->ne [0 ], hist_cur.data ());
1042
+ } break ;
1057
1043
default : {
1058
1044
fprintf (stderr, " %s: unsupported quantization type %d\n " , __func__, new_type);
1059
1045
return false ;
0 commit comments