From 09f716d7ad5fbf14d17129cf6c730a340ce13523 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:54:14 +0000 Subject: [PATCH 01/26] Add llama_model_quantize_params parameters --- include/llama.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index e5286f06162ab..5bf90ecd68dd7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -355,17 +355,25 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides + enum ggml_type attn_q_tensor_type; // attention query tensor type + enum ggml_type attn_k_tensor_type; // attention key tensor type + enum ggml_type attn_v_tensor_type; // attention value tensor type + enum ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + enum ggml_type attn_output_tensor_type; // attention output tensor type + enum ggml_type ffn_up_tensor_type; // feedforward up tensor type + enum ggml_type ffn_gate_tensor_type; // feedforward gate tensor type + enum ggml_type ffn_down_tensor_type; // feedforward down tensor type } llama_model_quantize_params; typedef struct llama_logit_bias { From ac908af25ccb5eae817c6f43d319404e428c2fb1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:55:47 +0000 Subject: [PATCH 02/26] Add new quantize parameters parsing and validation --- examples/quantize/quantize.cpp | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a4468b1698722..dfdb4b0a40971 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -277,6 +277,78 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { + if (arg_idx < argc-1) { + params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_q_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); 
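
As a minimal caller-side sketch (not part of the patch series), the per-tensor fields added in PATCH 01 can be set through the existing public quantization API. The helper function, file names, and chosen types below are illustrative assumptions, and the sketch relies on the GGML_TYPE_COUNT defaults introduced later in the series:

    #include "llama.h"

    // Sketch only: quantize a model while pinning two tensor families to explicit types.
    // Any *_tensor_type field left at GGML_TYPE_COUNT keeps the built-in per-tensor heuristics.
    static uint32_t quantize_with_overrides() {
        llama_model_quantize_params p = llama_model_quantize_default_params();
        p.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        p.attn_v_tensor_type   = GGML_TYPE_Q6_K;  // keep attn_v.weight at higher precision
        p.ffn_down_tensor_type = GGML_TYPE_Q5_K;  // likewise for ffn_down.weight
        return llama_model_quantize("model-f32.gguf", "model-quant.gguf", &p);  // 0 on success
    }
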
+ } + } else if (strcmp(argv[arg_idx], "--attention-k-type") == 0) { + if (arg_idx < argc-1) { + params.attn_k_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_k_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-v-type") == 0) { + if (arg_idx < argc-1) { + params.attn_v_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_v_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-output-type") == 0) { + if (arg_idx < argc-1) { + params.attn_output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-up-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_gate_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); From 337d9792e4b37a6f8ba13dcc6755dabd0f8b62a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:56:02 +0000 Subject: [PATCH 03/26] Update usage --- examples/quantize/quantize.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dfdb4b0a40971..ad981c73354ab 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,7 +105,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qkv-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type] [--feedforward-down-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. 
Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -114,6 +114,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --attention-q-type ggml_type: use this ggml_type for the attention query tensor\n"); + printf(" --attention-k-type ggml_type: use this ggml_type for the attention key tensor\n"); + printf(" --attention-v-type ggml_type: use this ggml_type for the attention value tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attention qkv tensor\n"); + printf(" --attention-output-type ggml_type: use this ggml_type for the attention output tensor\n"); + printf(" --feedforward-up-type ggml_type: use this ggml_type for the feedforward up tensor\n"); + printf(" --feedforward-gate-type ggml_type: use this ggml_type for the feedforward gate tensor\n"); + printf(" --feedforward-down-type ggml_type: use this ggml_type for the feedforward down tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); From 6f8d16dcada981e54868cd36fd7350a1806d0abc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:57:34 +0000 Subject: [PATCH 04/26] Add new parameters defaults --- src/llama-quant.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655a373..3b26c579adef0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -914,6 +914,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, + /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, }; return result; From 71c9f93e0a7616480da199f37001fc0fb17eaacd Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:58:20 +0000 Subject: [PATCH 05/26] Add new quantization parameters logic --- src/llama-quant.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3b26c579adef0..710de97b1abae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -776,13 +776,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. 
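
The override chain in the hunk below leans on two conventions that are easy to miss when reading the diff; here is a small illustrative sketch (the helper names are mine, not part of the patch):

    #include <string>
    #include "ggml.h"

    // GGML_TYPE_COUNT doubles as a "not requested" sentinel, so a user override is
    // detected by a simple ordered comparison against it.
    static bool has_override(ggml_type requested) {
        return requested < GGML_TYPE_COUNT;
    }

    // Per-layer tensors are named e.g. "blk.12.attn_q.weight", so substring matching
    // lets one option cover that tensor family across every layer.
    static bool is_attn_q(const std::string & name) {
        return name.find("attn_q.weight") != std::string::npos;
    }
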
if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { + new_type = params->attn_q_tensor_type; + } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { + new_type = params->attn_k_tensor_type; + } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { + new_type = params->attn_v_tensor_type; + } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { + new_type = params->attn_qkv_tensor_type; + } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { + new_type = params->attn_output_tensor_type; + } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { + new_type = params->ffn_up_tensor_type; + } else if (params->ffn_gate_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate.weight") != std::string::npos) { + new_type = params->ffn_gate_tensor_type; + } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { + new_type = params->ffn_down_tensor_type; + } else { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } } // If we've decided to quantize to the same type the tensor is already From 8e18131b53b2cf273d4bdf7ae87af9166d33e895 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:54:14 +0000 Subject: [PATCH 06/26] Add llama_model_quantize_params parameters --- include/llama.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index e5286f06162ab..5bf90ecd68dd7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -355,17 +355,25 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + 
enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides + enum ggml_type attn_q_tensor_type; // attention query tensor type + enum ggml_type attn_k_tensor_type; // attention key tensor type + enum ggml_type attn_v_tensor_type; // attention value tensor type + enum ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + enum ggml_type attn_output_tensor_type; // attention output tensor type + enum ggml_type ffn_up_tensor_type; // feedforward up tensor type + enum ggml_type ffn_gate_tensor_type; // feedforward gate tensor type + enum ggml_type ffn_down_tensor_type; // feedforward down tensor type } llama_model_quantize_params; typedef struct llama_logit_bias { From a77d94701339680b4bc1e681abc0dfa34ffb9582 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:55:47 +0000 Subject: [PATCH 07/26] Add new quantize parameters parsing and validation --- examples/quantize/quantize.cpp | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a4468b1698722..dfdb4b0a40971 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -277,6 +277,78 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { + if (arg_idx < argc-1) { + params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_q_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-k-type") == 0) { + if (arg_idx < argc-1) { + params.attn_k_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_k_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-v-type") == 0) { + if (arg_idx < argc-1) { + params.attn_v_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_v_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-output-type") == 0) { + if (arg_idx < argc-1) { + params.attn_output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-up-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-type") == 0) { + if 
(arg_idx < argc-1) { + params.ffn_gate_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); From 2414eaa9a6704e6aab3b6a1cdbc99bec498b9ffb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:56:02 +0000 Subject: [PATCH 08/26] Update usage --- examples/quantize/quantize.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dfdb4b0a40971..ad981c73354ab 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,7 +105,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qkv-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type] [--feedforward-down-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -114,6 +114,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --attention-q-type ggml_type: use this ggml_type for the attention query tensor\n"); + printf(" --attention-k-type ggml_type: use this ggml_type for the attention key tensor\n"); + printf(" --attention-v-type ggml_type: use this ggml_type for the attention value tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attention qkv tensor\n"); + printf(" --attention-output-type ggml_type: use this ggml_type for the attention output tensor\n"); + printf(" --feedforward-up-type ggml_type: use this ggml_type for the feedforward up tensor\n"); + printf(" --feedforward-gate-type ggml_type: use this ggml_type for the feedforward gate tensor\n"); + printf(" --feedforward-down-type ggml_type: use this ggml_type for the feedforward down tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); From 0dd66b81e4d37499cd0a92316bcb88cb536dce06 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:57:34 +0000 Subject: [PATCH 09/26] Add new parameters defaults --- src/llama-quant.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655a373..3b26c579adef0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -914,6 +914,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, + /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, }; return result; From 1d841c675d0e58d1a5d6ebab147553b3178b41d5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:58:20 +0000 Subject: [PATCH 10/26] Add new quantization parameters logic --- src/llama-quant.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3b26c579adef0..710de97b1abae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -776,13 +776,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. 
if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { + new_type = params->attn_q_tensor_type; + } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { + new_type = params->attn_k_tensor_type; + } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { + new_type = params->attn_v_tensor_type; + } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { + new_type = params->attn_qkv_tensor_type; + } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { + new_type = params->attn_output_tensor_type; + } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { + new_type = params->ffn_up_tensor_type; + } else if (params->ffn_gate_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate.weight") != std::string::npos) { + new_type = params->ffn_gate_tensor_type; + } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { + new_type = params->ffn_down_tensor_type; + } else { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } } // If we've decided to quantize to the same type the tensor is already From d86de03ceb81884369ab53ab1a40685f7fe9a373 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 14 Mar 2025 11:57:06 +0000 Subject: [PATCH 11/26] Minor refactoring as per the contributors' coding guidelines --- include/llama.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/llama.h b/include/llama.h index 5bf90ecd68dd7..08c82625f7ae6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -356,24 +356,24 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides - enum ggml_type attn_q_tensor_type; // attention query tensor type - enum ggml_type 
attn_k_tensor_type; // attention key tensor type - enum ggml_type attn_v_tensor_type; // attention value tensor type - enum ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type - enum ggml_type attn_output_tensor_type; // attention output tensor type - enum ggml_type ffn_up_tensor_type; // feedforward up tensor type - enum ggml_type ffn_gate_tensor_type; // feedforward gate tensor type - enum ggml_type ffn_down_tensor_type; // feedforward down tensor type + llama_ftype ftype; // quantize to this llama_ftype + ggml_type output_tensor_type; // output tensor type + ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides + ggml_type attn_q_tensor_type; // attention query tensor type + ggml_type attn_k_tensor_type; // attention key tensor type + ggml_type attn_v_tensor_type; // attention value tensor type + ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + ggml_type attn_output_tensor_type; // attention output tensor type + ggml_type ffn_up_tensor_type; // feedforward up tensor type + ggml_type ffn_gate_tensor_type; // feedforward gate tensor type + ggml_type ffn_down_tensor_type; // feedforward down tensor type } llama_model_quantize_params; typedef struct llama_logit_bias { From 99bae5e9297484b2739867568a5900b7832bb2d1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 14 Mar 2025 12:15:44 +0000 Subject: [PATCH 12/26] Update descriptions to match existing style --- examples/quantize/quantize.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ad981c73354ab..d9ff48a2fc9f0 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -114,14 +114,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --attention-q-type ggml_type: use this ggml_type for the attention query tensor\n"); - printf(" --attention-k-type ggml_type: use this ggml_type for the attention key tensor\n"); - printf(" --attention-v-type ggml_type: use this ggml_type for the attention value tensor\n"); - printf(" --attention-qkv-type ggml_type: use this ggml_type for the attention qkv tensor\n"); - printf(" --attention-output-type ggml_type: use this ggml_type for the attention output tensor\n"); - printf(" --feedforward-up-type ggml_type: use this ggml_type for the feedforward up tensor\n"); - printf(" --feedforward-gate-type ggml_type: use this ggml_type for the feedforward gate tensor\n"); - printf(" --feedforward-down-type ggml_type: use this ggml_type for the feedforward down tensor\n"); + printf(" --attention-q-type ggml_type: use this ggml_type for the attn_q.weight tensor\n"); + printf(" --attention-k-type ggml_type: use this ggml_type for the attn_k.weight tensor\n"); + printf(" 
--attention-v-type ggml_type: use this ggml_type for the attn_v.weight tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); + printf(" --attention-output-type ggml_type: use this ggml_type for the attn_output.weight tensor\n"); + printf(" --feedforward-up-type ggml_type: use this ggml_type for the ffn_up.weight tensor\n"); + printf(" --feedforward-gate-type ggml_type: use this ggml_type for the ffn_gate.weight tensor\n"); + printf(" --feedforward-down-type ggml_type: use this ggml_type for the ffn_down.weight tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); From f97b693a40ab175ebc172565df4b83fd03f0faaa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:40:44 +0000 Subject: [PATCH 13/26] Add llama_model_quantize_params parameters --- include/llama.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 5e2c05d9bf24f..76fb75e85d126 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,14 +366,27 @@ extern "C" { bool keep_split; // quantize to the same number of shards void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides + ggml_type attn_qkv_tensor_type; // attention query/key/value tensor type ggml_type attn_q_tensor_type; // attention query tensor type ggml_type attn_k_tensor_type; // attention key tensor type ggml_type attn_v_tensor_type; // attention value tensor type - ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + ggml_type attn_qa_tensor_type; // attention query a tensor type + ggml_type attn_qb_tensor_type; // attention query b tensor type + ggml_type attn_kva_tensor_type; // attention key/value a tensor type + ggml_type attn_kvb_tensor_type; // attention key/value b tensor type ggml_type attn_output_tensor_type; // attention output tensor type ggml_type ffn_up_tensor_type; // feedforward up tensor type ggml_type ffn_gate_tensor_type; // feedforward gate tensor type ggml_type ffn_down_tensor_type; // feedforward down tensor type + ggml_type ffn_up_exp_tensor_type; // feedforward up expert tensor type + ggml_type ffn_gate_exp_tensor_type; // feedforward gate expert tensor type + ggml_type ffn_down_exp_tensor_type; // feedforward down expert tensor type + ggml_type ffn_up_shexp_tensor_type; // feedforward up shared expert tensor type + ggml_type ffn_gate_shexp_tensor_type; // feedforward gate shared expert tensor type + ggml_type ffn_down_shexp_tensor_type; // feedforward down shared expert tensor type + ggml_type cls_tensor_type; // classifier tensor type + ggml_type cls_output_tensor_type; // classifier output tensor type + } llama_model_quantize_params; typedef struct llama_logit_bias { From f11e3da291f2bf367d4d528640d3f1ebdabcdb3c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:42:27 +0000 Subject: [PATCH 14/26] Add new quantize parameters parsing and validation --- examples/quantize/quantize.cpp | 114 ++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 3 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index d9ff48a2fc9f0..949dacaef4f8b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -285,6 +285,15 @@ int main(int argc, 
char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { if (arg_idx < argc-1) { params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); @@ -312,10 +321,37 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + } else if (strcmp(argv[arg_idx], "--attention-qa-type") == 0) { if (arg_idx < argc-1) { - params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + params.attn_qa_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qa_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-qb-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qb_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qb_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-kva-type") == 0) { + if (arg_idx < argc-1) { + params.attn_kva_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_kva_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-kvb-type") == 0) { + if (arg_idx < argc-1) { + params.attn_kvb_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_kvb_tensor_type == GGML_TYPE_COUNT) { usage(argv[0]); } } else { @@ -357,6 +393,78 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--feedforward-up-exp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_exp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-exp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_gate_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_exp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-exp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_exp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-up-shexp_type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_shexp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-shexp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_gate_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_shexp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-shexp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_shexp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else 
{ + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--classifier-type") == 0) { + if (arg_idx < argc-1) { + params.cls_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.cls_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--classifier-output-type") == 0) { + if (arg_idx < argc-1) { + params.cls_output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.cls_output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); From ad1e352425c1bcca1c62b859d8084f39fcb066eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:42:42 +0000 Subject: [PATCH 15/26] Update usage --- examples/quantize/quantize.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 949dacaef4f8b..5ad9e9502e590 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,7 +105,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qkv-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type] [--feedforward-down-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable); + printf(" [--token-embedding-type] [--attention-qkv-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qa-type]\n"); + printf(" [--attention-qb-type] [--attention-kva-type] [--attention-kvb-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type]\n"); + printf(" [--feedforward-down-type] [--feedforward-gate-exp-type] [--feedforward-down-exp-type] [--feedforward-up-exp-type] [--feedforward-gate-shexp-type]\n"); + printf(" [--feedforward-down-shexp-type] [--feedforward-up-shexp-type] [--classifier-type] [--classifier-output-type] [--override-kv]\n"); + printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -114,14 +119,26 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); printf(" --attention-q-type ggml_type: use this ggml_type for the attn_q.weight tensor\n"); printf(" --attention-k-type ggml_type: use this ggml_type for the attn_k.weight tensor\n"); printf(" --attention-v-type ggml_type: use this ggml_type for the attn_v.weight tensor\n"); - printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); + printf(" --attention-qa-type ggml_type: use this ggml_type for the attn_q_a.weight tensor\n"); + printf(" --attention-qb-type ggml_type: use this ggml_type for the attn_q_b.weight tensor\n"); + printf(" --attention-kva-type ggml_type: use this ggml_type for the attn_kv_a_mqa.weight tensor\n"); + printf(" --attention-kvb-type ggml_type: use this ggml_type for the attn_kv_b.weight tensor\n"); printf(" --attention-output-type ggml_type: use this ggml_type for the attn_output.weight tensor\n"); printf(" --feedforward-up-type ggml_type: use this ggml_type for the ffn_up.weight tensor\n"); printf(" --feedforward-gate-type ggml_type: use this ggml_type for the ffn_gate.weight tensor\n"); printf(" --feedforward-down-type ggml_type: use this ggml_type for the ffn_down.weight tensor\n"); + printf(" --feedforward-up-exp-type ggml_type: use this ggml_type for the ffn_up_exp.weight tensor\n"); + printf(" --feedforward-gate-exp-type ggml_type: use this ggml_type for the ffn_gate_exp.weight tensor\n"); + printf(" --feedforward-down-exp-type ggml_type: use this ggml_type for the ffn_down_exp.weight tensor\n"); + printf(" --feedforward-up-shexp-type ggml_type: use this ggml_type for the ffn_up_shexp.weight tensor\n"); + printf(" --feedforward-gate-shexp-type ggml_type: use this ggml_type for the ffn_gate_shexp.weight tensor\n"); + printf(" --feedforward-down-shexp-type ggml_type: use this ggml_type for the ffn_down_shexp.weight tensor\n"); + printf(" --classifier-type ggml_type: use this ggml_type for the cls.weight tensor\n"); + printf(" --classifier-output-type ggml_type: use this ggml_type for the cls.output.weight tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n"); From 4e5c96a3e0624961f9c59ba1dac9b243a780011e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:43:25 +0000 Subject: [PATCH 16/26] Add new parameters defaults --- src/llama-quant.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b49b02aca7863..4d9b813f362de 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -939,14 +939,26 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, + /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qa_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qb_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_kva_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_kvb_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_exp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_exp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_exp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_shexp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_shexp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_shexp_tensor_type =*/ GGML_TYPE_COUNT, + /*.cls_tensor_type =*/ GGML_TYPE_COUNT, + /*.cls_output_tensor_type =*/ GGML_TYPE_COUNT, }; return result; From 9b3ccb535e0acbc2d34b1f91a4f06efdcfd6270e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:43:56 +0000 Subject: [PATCH 17/26] Add new quantization parameters logic --- src/llama-quant.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4d9b813f362de..467240c696f92 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -789,14 +789,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = params->token_embedding_type; } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { new_type = params->output_tensor_type; + } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { + new_type = params->attn_qkv_tensor_type; } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { new_type = params->attn_q_tensor_type; } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { new_type = params->attn_k_tensor_type; } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { new_type = params->attn_v_tensor_type; - } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { - new_type = params->attn_qkv_tensor_type; + } else if (params->attn_qa_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_a.weight") != std::string::npos) { + new_type = params->attn_qa_tensor_type; + } else if (params->attn_qb_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_b_mqa.weight") != std::string::npos) { + new_type = params->attn_qb_tensor_type; + } else if (params->attn_kva_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_a_mqa.weight") != 
std::string::npos) { + new_type = params->attn_kva_tensor_type; + } else if (params->attn_kvb_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_b.weight") != std::string::npos) { + new_type = params->attn_kvb_tensor_type; } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { new_type = params->attn_output_tensor_type; } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { @@ -805,6 +813,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = params->ffn_gate_tensor_type; } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { new_type = params->ffn_down_tensor_type; + } else if (params->ffn_up_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_exps.weight") != std::string::npos) { + new_type = params->ffn_up_exp_tensor_type; + } else if (params->ffn_gate_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate_exps.weight") != std::string::npos) { + new_type = params->ffn_gate_exp_tensor_type; + } else if (params->ffn_down_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_exps.weight") != std::string::npos) { + new_type = params->ffn_down_exp_tensor_type; + } else if (params->ffn_up_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_shexp.weight") != std::string::npos) { + new_type = params->ffn_up_shexp_tensor_type; + } else if (params->ffn_gate_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate_shexp.weight") != std::string::npos) { + new_type = params->ffn_gate_shexp_tensor_type; + } else if (params->ffn_down_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_shexp.weight") != std::string::npos) { + new_type = params->ffn_down_shexp_tensor_type; + } else if (params->cls_tensor_type < GGML_TYPE_COUNT && name.find("cls.weight") != std::string::npos) { + new_type = params->cls_tensor_type; + } else if (params->cls_output_tensor_type < GGML_TYPE_COUNT && name.find("cls.output.weight") != std::string::npos) { + new_type = params->cls_output_tensor_type; } else { new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } From 35f45f19d1fe31e2e7ae70a2b762c54b9122e209 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 23:10:04 +0000 Subject: [PATCH 18/26] Minor refactoring as per the contributors' guidelines --- examples/quantize/quantize.cpp | 3 +-- src/llama-quant.cpp | 16 +++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 5ad9e9502e590..a87cfd13b24c7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include struct quant_option { @@ -16,7 +15,7 @@ struct quant_option { std::string desc; }; -static const std::vector QUANT_OPTIONS = { +static const std::vector QUANT_OPTIONS = { { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 467240c696f92..bfe2e99f169c5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -5,11 +5,9 @@ #include "llama-model-loader.h" #include -#include #include #include #include -#include #include #include @@ -48,7 +46,7 @@ struct quantize_state_impl { }; static void 
llama_tensor_dequantize_impl( - struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, + ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread ) { if (output.size() < nelements) { @@ -536,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: model.load_hparams(ml); model.load_stats (ml); - struct quantize_state_impl qs(model, params); + quantize_state_impl qs(model, params); if (params->only_copy) { ftype = ml.ftype; @@ -661,7 +659,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // populate the original tensors so we get an initial meta data for (const auto * it : tensors) { uint16_t i_split = params->keep_split ? it->idx : 0; - struct ggml_tensor * tensor = it->tensor; + ggml_tensor * tensor = it->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } @@ -710,7 +708,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_ofstream(0); for (const auto * it : tensors) { const auto & weight = *it; - struct ggml_tensor * tensor = weight.tensor; + ggml_tensor * tensor = weight.tensor; if (weight.idx != cur_split && params->keep_split) { close_ofstream(); new_ofstream(weight.idx); @@ -776,7 +774,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; - enum ggml_type new_type; + ggml_type new_type; void * new_data; size_t new_size; @@ -950,8 +948,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // interface implementation // -struct llama_model_quantize_params llama_model_quantize_default_params() { - struct llama_model_quantize_params result = { +llama_model_quantize_params llama_model_quantize_default_params() { + llama_model_quantize_params result = { /*.nthread =*/ 0, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.output_tensor_type =*/ GGML_TYPE_COUNT, From 54e13cf69919180816f2e901037124c06c2d4b08 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Mar 2025 12:18:59 +0000 Subject: [PATCH 19/26] Implement general --tensor-type instead of tensor-specific command option --- examples/quantize/quantize.cpp | 301 +++++++++++---------------------- include/llama.h | 22 +-- src/llama-quant.cpp | 89 +++------- 3 files changed, 122 insertions(+), 290 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a87cfd13b24c7..88c2167b38a20 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,11 +105,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable); - printf(" [--token-embedding-type] [--attention-qkv-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qa-type]\n"); - printf(" [--attention-qb-type] [--attention-kva-type] [--attention-kvb-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type]\n"); - printf(" [--feedforward-down-type] [--feedforward-gate-exp-type] [--feedforward-down-exp-type] [--feedforward-up-exp-type] [--feedforward-gate-shexp-type]\n"); - printf(" [--feedforward-down-shexp-type] [--feedforward-up-shexp-type] 
[--classifier-type] [--classifier-output-type] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -118,26 +114,8 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); - printf(" --attention-q-type ggml_type: use this ggml_type for the attn_q.weight tensor\n"); - printf(" --attention-k-type ggml_type: use this ggml_type for the attn_k.weight tensor\n"); - printf(" --attention-v-type ggml_type: use this ggml_type for the attn_v.weight tensor\n"); - printf(" --attention-qa-type ggml_type: use this ggml_type for the attn_q_a.weight tensor\n"); - printf(" --attention-qb-type ggml_type: use this ggml_type for the attn_q_b.weight tensor\n"); - printf(" --attention-kva-type ggml_type: use this ggml_type for the attn_kv_a_mqa.weight tensor\n"); - printf(" --attention-kvb-type ggml_type: use this ggml_type for the attn_kv_b.weight tensor\n"); - printf(" --attention-output-type ggml_type: use this ggml_type for the attn_output.weight tensor\n"); - printf(" --feedforward-up-type ggml_type: use this ggml_type for the ffn_up.weight tensor\n"); - printf(" --feedforward-gate-type ggml_type: use this ggml_type for the ffn_gate.weight tensor\n"); - printf(" --feedforward-down-type ggml_type: use this ggml_type for the ffn_down.weight tensor\n"); - printf(" --feedforward-up-exp-type ggml_type: use this ggml_type for the ffn_up_exp.weight tensor\n"); - printf(" --feedforward-gate-exp-type ggml_type: use this ggml_type for the ffn_gate_exp.weight tensor\n"); - printf(" --feedforward-down-exp-type ggml_type: use this ggml_type for the ffn_down_exp.weight tensor\n"); - printf(" --feedforward-up-shexp-type ggml_type: use this ggml_type for the ffn_up_shexp.weight tensor\n"); - printf(" --feedforward-gate-shexp-type ggml_type: use this ggml_type for the ffn_gate_shexp.weight tensor\n"); - printf(" --feedforward-down-shexp-type ggml_type: use this ggml_type for the ffn_down_shexp.weight tensor\n"); - printf(" --classifier-type ggml_type: use this ggml_type for the cls.weight tensor\n"); - printf(" --classifier-output-type ggml_type: use this ggml_type for the cls.output.weight tensor\n"); + printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); + printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n"); @@ -268,6 +246,95 @@ static ggml_type parse_ggml_type(const char * arg) { return GGML_TYPE_COUNT; } +// Allowed tensors for arbitrary quantization with --tensor-type option +static const std::vector ALLOWED_TENSOR_TYPE = { + "attn_k", + "attn_kv_a_mqa", + "attn_kv_b", + "attn_out", + "attn_q_a", + "attn_q_b", + "attn_q", + "attn_qkv", + "attn_v", + "channel_mix_key", + "channel_mix_receptance", + "channel_mix_value", + "cls_out", + "cls", + "dec_attn_k", + "dec_attn_out", + "dec_attn_q", + "dec_attn_v", + "dec_cross_attn_k", + "dec_cross_attn_out", + "dec_cross_attn_q", + "dec_cross_attn_v", + "ffn_act", + "ffn_down_exp", + "ffn_down_shexp", + "ffn_down", + "ffn_gate_exp", + "ffn_gate_shexp", + "ffn_gate", + "ffn_up_exp", + "ffn_up_shexp", + "ffn_up", + "ssm_in", + "ssm_out", + "time_mix_gate", + "time_mix_key", + "time_mix_output", + "time_mix_receptance", + "time_mix_value", +}; + +// changes to this struct must be replicated in llama-quant.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + +static bool string_parse_tensor_type(const char * data, std::vector & tensor_type) { + const char * sep = strchr(data, '='); + if (sep == nullptr) { + printf("\n%s: malformed tensor type '%s'\n\n", __func__, data); + return false; + } + + const size_t tn_len = sep - data; + if (tn_len == 0) { + printf("\n%s: missing tensor name\n\n", __func__); + return false; + } + + if (const size_t qt_len = strlen(sep); qt_len == 1) { + printf("\n%s: missing quantization type\n\n", __func__); + return false; + } + + std::string tn(data, tn_len); + std::transform(tn.begin(), tn.end(), tn.begin(), tolower); + sep++; + const std::string qt(sep); + + if (find(ALLOWED_TENSOR_TYPE.begin(), ALLOWED_TENSOR_TYPE.end(), tn) == ALLOWED_TENSOR_TYPE.end()) { + printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str()); + return false; + } + + if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) { + printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str()); + return false; + } + + tensor_quantization tqz; + tqz.name = tn; + tqz.quant = parse_ggml_type(qt.c_str()); + tensor_type.emplace_back(std::move(tqz)); + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -279,6 +346,7 @@ int main(int argc, char ** argv) { std::string imatrix_file; std::vector included_weights, excluded_weights; std::vector kv_overrides; + std::vector tensor_types; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -301,184 +369,8 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { - if (arg_idx < argc-1) { - params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { - if (arg_idx < argc-1) { - params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_q_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-k-type") == 0) { - if (arg_idx < argc-1) { - params.attn_k_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_k_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], 
"--attention-v-type") == 0) { - if (arg_idx < argc-1) { - params.attn_v_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_v_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-qa-type") == 0) { - if (arg_idx < argc-1) { - params.attn_qa_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qa_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-qb-type") == 0) { - if (arg_idx < argc-1) { - params.attn_qb_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qb_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-kva-type") == 0) { - if (arg_idx < argc-1) { - params.attn_kva_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_kva_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-kvb-type") == 0) { - if (arg_idx < argc-1) { - params.attn_kvb_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_kvb_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-output-type") == 0) { - if (arg_idx < argc-1) { - params.attn_output_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_output_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-up-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_up_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_up_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-gate-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_gate_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_gate_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-down-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_down_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_down_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-up-exp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_up_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_up_exp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-gate-exp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_gate_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_gate_exp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-down-exp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_down_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_down_exp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-up-shexp_type") == 0) { - if (arg_idx < argc-1) { - params.ffn_up_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_up_shexp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if 
(strcmp(argv[arg_idx], "--feedforward-gate-shexp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_gate_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_gate_shexp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-down-shexp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_down_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_down_shexp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--classifier-type") == 0) { - if (arg_idx < argc-1) { - params.cls_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.cls_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--classifier-output-type") == 0) { - if (arg_idx < argc-1) { - params.cls_output_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.cls_output_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { + } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) { + if (arg_idx == argc-1 || !string_parse_tensor_type(argv[++arg_idx], tensor_types)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { @@ -565,6 +457,9 @@ int main(int argc, char ** argv) { kv_overrides.back().key[0] = 0; params.kv_overrides = &kv_overrides; } + if (!tensor_types.empty()) { + params.tensor_types = &tensor_types; + } llama_backend_init(); diff --git a/include/llama.h b/include/llama.h index 76fb75e85d126..261c6a4cd947c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,27 +366,7 @@ extern "C" { bool keep_split; // quantize to the same number of shards void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides - ggml_type attn_qkv_tensor_type; // attention query/key/value tensor type - ggml_type attn_q_tensor_type; // attention query tensor type - ggml_type attn_k_tensor_type; // attention key tensor type - ggml_type attn_v_tensor_type; // attention value tensor type - ggml_type attn_qa_tensor_type; // attention query a tensor type - ggml_type attn_qb_tensor_type; // attention query b tensor type - ggml_type attn_kva_tensor_type; // attention key/value a tensor type - ggml_type attn_kvb_tensor_type; // attention key/value b tensor type - ggml_type attn_output_tensor_type; // attention output tensor type - ggml_type ffn_up_tensor_type; // feedforward up tensor type - ggml_type ffn_gate_tensor_type; // feedforward gate tensor type - ggml_type ffn_down_tensor_type; // feedforward down tensor type - ggml_type ffn_up_exp_tensor_type; // feedforward up expert tensor type - ggml_type ffn_gate_exp_tensor_type; // feedforward gate expert tensor type - ggml_type ffn_down_exp_tensor_type; // feedforward down expert tensor type - ggml_type ffn_up_shexp_tensor_type; // feedforward up shared expert tensor type - ggml_type ffn_gate_shexp_tensor_type; // feedforward gate shared expert tensor type - ggml_type ffn_down_shexp_tensor_type; // feedforward down shared expert tensor type - ggml_type cls_tensor_type; // classifier tensor type - ggml_type cls_output_tensor_type; // classifier output tensor type - + void * tensor_types; // pointer to vector containing tensor types } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index bfe2e99f169c5..65c20904ba1a9 100644 --- a/src/llama-quant.cpp +++ 
b/src/llama-quant.cpp @@ -45,6 +45,12 @@ struct quantize_state_impl { {} }; +// changes to this struct must be replicated in quantize.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -783,54 +789,24 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. if (!params->pure && ggml_is_quantized(default_type)) { - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { - new_type = params->attn_qkv_tensor_type; - } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { - new_type = params->attn_q_tensor_type; - } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { - new_type = params->attn_k_tensor_type; - } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { - new_type = params->attn_v_tensor_type; - } else if (params->attn_qa_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_a.weight") != std::string::npos) { - new_type = params->attn_qa_tensor_type; - } else if (params->attn_qb_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_b_mqa.weight") != std::string::npos) { - new_type = params->attn_qb_tensor_type; - } else if (params->attn_kva_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_a_mqa.weight") != std::string::npos) { - new_type = params->attn_kva_tensor_type; - } else if (params->attn_kvb_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_b.weight") != std::string::npos) { - new_type = params->attn_kvb_tensor_type; - } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { - new_type = params->attn_output_tensor_type; - } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { - new_type = params->ffn_up_tensor_type; - } else if (params->ffn_gate_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate.weight") != std::string::npos) { - new_type = params->ffn_gate_tensor_type; - } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { - new_type = params->ffn_down_tensor_type; - } else if (params->ffn_up_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_exps.weight") != std::string::npos) { - new_type = params->ffn_up_exp_tensor_type; - } else if (params->ffn_gate_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate_exps.weight") != std::string::npos) { - new_type = params->ffn_gate_exp_tensor_type; - } else if (params->ffn_down_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_exps.weight") != std::string::npos) { - new_type = params->ffn_down_exp_tensor_type; - } else if (params->ffn_up_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_shexp.weight") != std::string::npos) { - new_type = params->ffn_up_shexp_tensor_type; - } else if (params->ffn_gate_shexp_tensor_type < 
GGML_TYPE_COUNT && name.find("ffn_gate_shexp.weight") != std::string::npos) { - new_type = params->ffn_gate_shexp_tensor_type; - } else if (params->ffn_down_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_shexp.weight") != std::string::npos) { - new_type = params->ffn_down_shexp_tensor_type; - } else if (params->cls_tensor_type < GGML_TYPE_COUNT && name.find("cls.weight") != std::string::npos) { - new_type = params->cls_tensor_type; - } else if (params->cls_output_tensor_type < GGML_TYPE_COUNT && name.find("cls.output.weight") != std::string::npos) { - new_type = params->cls_output_tensor_type; - } else { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + // unless the user specifies a type + if (params->tensor_types) { + const std::vector & tensor_types = *static_cast *>(params->tensor_types); + for (const auto & [name, quant] : tensor_types) { + if (std::string str(tensor->name); str.find(name) != std::string::npos) { + new_type = quant; + break; + } + } } } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. @@ -961,26 +937,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, - /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_qa_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_qb_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_kva_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_kvb_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_up_exp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_gate_exp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_down_exp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_up_shexp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_gate_shexp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_down_shexp_tensor_type =*/ GGML_TYPE_COUNT, - /*.cls_tensor_type =*/ GGML_TYPE_COUNT, - /*.cls_output_tensor_type =*/ GGML_TYPE_COUNT, + /*.tensor_type =*/ nullptr, }; return result; From b3c7db572c44867746249ecb49527ae9b49f24f7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 30 Mar 2025 07:47:13 +0100 Subject: [PATCH 20/26] Fix implied type bug --- include/llama.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/llama.h b/include/llama.h index d226b5a857e94..e2ca8081e7b04 100644 --- a/include/llama.h +++ b/include/llama.h @@ -356,10 +356,10 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - llama_ftype ftype; // quantize to this llama_ftype - ggml_type output_tensor_type; // output tensor type - ggml_type token_embedding_type; // token embeddings tensor type + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum 
llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored From 625f0ae57b91ea117e5fdb450d644cf5a2c0eb42 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 31 Mar 2025 07:22:02 +0100 Subject: [PATCH 21/26] Restore missing #includes --- examples/quantize/quantize.cpp | 1 + src/llama-quant.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 88c2167b38a20..04e5ce4ef4d60 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include struct quant_option { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 65c20904ba1a9..eb3863e0dacfe 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -5,9 +5,11 @@ #include "llama-model-loader.h" #include +#include #include #include #include +#include #include #include From 2fd0b41f50af5323849f4ed79c34bef69825ddd3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 1 Apr 2025 22:20:04 +0100 Subject: [PATCH 22/26] Add regex capability for tensor selection --- examples/quantize/quantize.cpp | 10 +++++++++- src/llama-quant.cpp | 8 +++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 04e5ce4ef4d60..f642e6c1ba55d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -319,7 +319,15 @@ static bool string_parse_tensor_type(const char * data, std::vector #include #include +#include #include #include @@ -795,9 +796,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // unless the user specifies a type if (params->tensor_types) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); - for (const auto & [name, quant] : tensor_types) { - if (std::string str(tensor->name); str.find(name) != std::string::npos) { - new_type = quant; + for (const auto & [tname, qtype] : tensor_types) { + if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) { + LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; break; } } From 054ede4eb1448c199bb98df804e16e6c19144f06 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 3 Apr 2025 08:07:49 +0100 Subject: [PATCH 23/26] Refactor function name and update ALLOWED_TENSOR_TYPE --- examples/quantize/quantize.cpp | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f642e6c1ba55d..ffdcc1092d4be 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -252,35 +252,32 @@ static const std::vector ALLOWED_TENSOR_TYPE = { "attn_k", "attn_kv_a_mqa", "attn_kv_b", - "attn_out", + "attn_o", + "attn_output", + "attn_q", "attn_q_a", "attn_q_b", - "attn_q", "attn_qkv", "attn_v", "channel_mix_key", "channel_mix_receptance", "channel_mix_value", - "cls_out", "cls", - "dec_attn_k", - "dec_attn_out", - "dec_attn_q", - "dec_attn_v", - "dec_cross_attn_k", - "dec_cross_attn_out", - "dec_cross_attn_q", - "dec_cross_attn_v", + "cls.output", + "cross_attn_k", + 
"cross_attn_o", + "cross_attn_q", + "cross_attn_v", "ffn_act", - "ffn_down_exp", - "ffn_down_shexp", "ffn_down", - "ffn_gate_exp", - "ffn_gate_shexp", + "ffn_down_exps", + "ffn_down_shexp", "ffn_gate", - "ffn_up_exp", - "ffn_up_shexp", + "ffn_gate_exps", + "ffn_gate_shexp", "ffn_up", + "ffn_up_exps", + "ffn_up_shexp", "ssm_in", "ssm_out", "time_mix_gate", @@ -296,7 +293,7 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; -static bool string_parse_tensor_type(const char * data, std::vector & tensor_type) { +static bool parse_tensor_type(const char * data, std::vector & tensor_type) { const char * sep = strchr(data, '='); if (sep == nullptr) { printf("\n%s: malformed tensor type '%s'\n\n", __func__, data); @@ -322,7 +319,7 @@ static bool string_parse_tensor_type(const char * data, std::vector Date: Thu, 3 Apr 2025 08:14:09 +0100 Subject: [PATCH 24/26] Add missing #include --- examples/quantize/quantize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ffdcc1092d4be..da6a1a467c467 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -9,6 +9,7 @@ #include #include #include +#include struct quant_option { std::string name; From 1acb9f4a841405cbda04e03b4fb8bc039b8e5792 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 3 Apr 2025 23:32:02 +0100 Subject: [PATCH 25/26] Handle edge case when tensor name is cls.output --- examples/quantize/quantize.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index da6a1a467c467..0355311dc5c06 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -319,8 +319,15 @@ static bool parse_tensor_type(const char * data, std::vector Date: Mon, 7 Apr 2025 19:36:29 +0100 Subject: [PATCH 26/26] Minor logging improvement --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 68d9c5bec53d5..dbac9f37f0323 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -798,7 +798,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const std::vector & tensor_types = *static_cast *>(params->tensor_types); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) { - LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + if (qtype != new_type) { + LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + } new_type = qtype; break; }