From 09f716d7ad5fbf14d17129cf6c730a340ce13523 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:54:14 +0000 Subject: [PATCH 01/26] Add llama_model_quantize_params parameters --- include/llama.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index e5286f06162ab..5bf90ecd68dd7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -355,17 +355,25 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides + enum ggml_type attn_q_tensor_type; // attention query tensor type + enum ggml_type attn_k_tensor_type; // attention key tensor type + enum ggml_type attn_v_tensor_type; // attention value tensor type + enum ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + enum ggml_type attn_output_tensor_type; // attention output tensor type + enum ggml_type ffn_up_tensor_type; // feedforward up tensor type + enum ggml_type ffn_gate_tensor_type; // feedforward gate tensor type + enum ggml_type ffn_down_tensor_type; // feedforward down tensor type } llama_model_quantize_params; typedef struct llama_logit_bias { From ac908af25ccb5eae817c6f43d319404e428c2fb1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:55:47 +0000 Subject: [PATCH 02/26] Add new quantize parameters parsing and validation --- examples/quantize/quantize.cpp | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a4468b1698722..dfdb4b0a40971 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -277,6 +277,78 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { + if (arg_idx < argc-1) { + params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_q_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); 
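
As a minimal caller-side sketch (not part of the patch series), the per-tensor fields added in PATCH 01 can be set through the existing public quantization API. The helper function, file names, and chosen types below are illustrative assumptions, and the sketch relies on the GGML_TYPE_COUNT defaults introduced later in the series:

    #include "llama.h"

    // Sketch only: quantize a model while pinning two tensor families to explicit types.
    // Any *_tensor_type field left at GGML_TYPE_COUNT keeps the built-in per-tensor heuristics.
    static uint32_t quantize_with_overrides() {
        llama_model_quantize_params p = llama_model_quantize_default_params();
        p.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        p.attn_v_tensor_type   = GGML_TYPE_Q6_K;  // keep attn_v.weight at higher precision
        p.ffn_down_tensor_type = GGML_TYPE_Q5_K;  // likewise for ffn_down.weight
        return llama_model_quantize("model-f32.gguf", "model-quant.gguf", &p);  // 0 on success
    }
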
+ } + } else if (strcmp(argv[arg_idx], "--attention-k-type") == 0) { + if (arg_idx < argc-1) { + params.attn_k_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_k_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-v-type") == 0) { + if (arg_idx < argc-1) { + params.attn_v_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_v_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-output-type") == 0) { + if (arg_idx < argc-1) { + params.attn_output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-up-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_gate_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); From 337d9792e4b37a6f8ba13dcc6755dabd0f8b62a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:56:02 +0000 Subject: [PATCH 03/26] Update usage --- examples/quantize/quantize.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dfdb4b0a40971..ad981c73354ab 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,7 +105,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qkv-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type] [--feedforward-down-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. 
Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -114,6 +114,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --attention-q-type ggml_type: use this ggml_type for the attention query tensor\n"); + printf(" --attention-k-type ggml_type: use this ggml_type for the attention key tensor\n"); + printf(" --attention-v-type ggml_type: use this ggml_type for the attention value tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attention qkv tensor\n"); + printf(" --attention-output-type ggml_type: use this ggml_type for the attention output tensor\n"); + printf(" --feedforward-up-type ggml_type: use this ggml_type for the feedforward up tensor\n"); + printf(" --feedforward-gate-type ggml_type: use this ggml_type for the feedforward gate tensor\n"); + printf(" --feedforward-down-type ggml_type: use this ggml_type for the feedforward down tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); From 6f8d16dcada981e54868cd36fd7350a1806d0abc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:57:34 +0000 Subject: [PATCH 04/26] Add new parameters defaults --- src/llama-quant.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655a373..3b26c579adef0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -914,6 +914,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, + /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, }; return result; From 71c9f93e0a7616480da199f37001fc0fb17eaacd Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:58:20 +0000 Subject: [PATCH 05/26] Add new quantization parameters logic --- src/llama-quant.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3b26c579adef0..710de97b1abae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -776,13 +776,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. 
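
The override chain in the hunk below leans on two conventions that are easy to miss when reading the diff; here is a small illustrative sketch (the helper names are mine, not part of the patch):

    #include <string>
    #include "ggml.h"

    // GGML_TYPE_COUNT doubles as a "not requested" sentinel, so a user override is
    // detected by a simple ordered comparison against it.
    static bool has_override(ggml_type requested) {
        return requested < GGML_TYPE_COUNT;
    }

    // Per-layer tensors are named e.g. "blk.12.attn_q.weight", so substring matching
    // lets one option cover that tensor family across every layer.
    static bool is_attn_q(const std::string & name) {
        return name.find("attn_q.weight") != std::string::npos;
    }
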
if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { + new_type = params->attn_q_tensor_type; + } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { + new_type = params->attn_k_tensor_type; + } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { + new_type = params->attn_v_tensor_type; + } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { + new_type = params->attn_qkv_tensor_type; + } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { + new_type = params->attn_output_tensor_type; + } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { + new_type = params->ffn_up_tensor_type; + } else if (params->ffn_gate_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate.weight") != std::string::npos) { + new_type = params->ffn_gate_tensor_type; + } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { + new_type = params->ffn_down_tensor_type; + } else { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } } // If we've decided to quantize to the same type the tensor is already From 8e18131b53b2cf273d4bdf7ae87af9166d33e895 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:54:14 +0000 Subject: [PATCH 06/26] Add llama_model_quantize_params parameters --- include/llama.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index e5286f06162ab..5bf90ecd68dd7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -355,17 +355,25 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + 
enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides + enum ggml_type attn_q_tensor_type; // attention query tensor type + enum ggml_type attn_k_tensor_type; // attention key tensor type + enum ggml_type attn_v_tensor_type; // attention value tensor type + enum ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + enum ggml_type attn_output_tensor_type; // attention output tensor type + enum ggml_type ffn_up_tensor_type; // feedforward up tensor type + enum ggml_type ffn_gate_tensor_type; // feedforward gate tensor type + enum ggml_type ffn_down_tensor_type; // feedforward down tensor type } llama_model_quantize_params; typedef struct llama_logit_bias { From a77d94701339680b4bc1e681abc0dfa34ffb9582 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:55:47 +0000 Subject: [PATCH 07/26] Add new quantize parameters parsing and validation --- examples/quantize/quantize.cpp | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a4468b1698722..dfdb4b0a40971 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -277,6 +277,78 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { + if (arg_idx < argc-1) { + params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_q_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-k-type") == 0) { + if (arg_idx < argc-1) { + params.attn_k_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_k_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-v-type") == 0) { + if (arg_idx < argc-1) { + params.attn_v_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_v_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-output-type") == 0) { + if (arg_idx < argc-1) { + params.attn_output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-up-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-type") == 0) { + if 
(arg_idx < argc-1) { + params.ffn_gate_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); From 2414eaa9a6704e6aab3b6a1cdbc99bec498b9ffb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:56:02 +0000 Subject: [PATCH 08/26] Update usage --- examples/quantize/quantize.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dfdb4b0a40971..ad981c73354ab 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,7 +105,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qkv-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type] [--feedforward-down-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -114,6 +114,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --attention-q-type ggml_type: use this ggml_type for the attention query tensor\n"); + printf(" --attention-k-type ggml_type: use this ggml_type for the attention key tensor\n"); + printf(" --attention-v-type ggml_type: use this ggml_type for the attention value tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attention qkv tensor\n"); + printf(" --attention-output-type ggml_type: use this ggml_type for the attention output tensor\n"); + printf(" --feedforward-up-type ggml_type: use this ggml_type for the feedforward up tensor\n"); + printf(" --feedforward-gate-type ggml_type: use this ggml_type for the feedforward gate tensor\n"); + printf(" --feedforward-down-type ggml_type: use this ggml_type for the feedforward down tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); From 0dd66b81e4d37499cd0a92316bcb88cb536dce06 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:57:34 +0000 Subject: [PATCH 09/26] Add new parameters defaults --- src/llama-quant.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655a373..3b26c579adef0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -914,6 +914,14 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, + /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, }; return result; From 1d841c675d0e58d1a5d6ebab147553b3178b41d5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 13 Mar 2025 18:58:20 +0000 Subject: [PATCH 10/26] Add new quantization parameters logic --- src/llama-quant.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3b26c579adef0..710de97b1abae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -776,13 +776,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. 
if (!params->pure && ggml_is_quantized(default_type)) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { + new_type = params->attn_q_tensor_type; + } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { + new_type = params->attn_k_tensor_type; + } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { + new_type = params->attn_v_tensor_type; + } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { + new_type = params->attn_qkv_tensor_type; + } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { + new_type = params->attn_output_tensor_type; + } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { + new_type = params->ffn_up_tensor_type; + } else if (params->ffn_gate_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate.weight") != std::string::npos) { + new_type = params->ffn_gate_tensor_type; + } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { + new_type = params->ffn_down_tensor_type; + } else { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + } } // If we've decided to quantize to the same type the tensor is already From d86de03ceb81884369ab53ab1a40685f7fe9a373 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 14 Mar 2025 11:57:06 +0000 Subject: [PATCH 11/26] Minor refactoring as per the contributors' coding guidelines --- include/llama.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/llama.h b/include/llama.h index 5bf90ecd68dd7..08c82625f7ae6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -356,24 +356,24 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - enum ggml_type output_tensor_type; // output tensor type - enum ggml_type token_embedding_type; // token embeddings tensor type - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - bool keep_split; // quantize to the same number of shards - void * imatrix; // pointer to importance matrix data - void * kv_overrides; // pointer to vector containing overrides - enum ggml_type attn_q_tensor_type; // attention query tensor type - enum ggml_type 
attn_k_tensor_type; // attention key tensor type - enum ggml_type attn_v_tensor_type; // attention value tensor type - enum ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type - enum ggml_type attn_output_tensor_type; // attention output tensor type - enum ggml_type ffn_up_tensor_type; // feedforward up tensor type - enum ggml_type ffn_gate_tensor_type; // feedforward gate tensor type - enum ggml_type ffn_down_tensor_type; // feedforward down tensor type + llama_ftype ftype; // quantize to this llama_ftype + ggml_type output_tensor_type; // output tensor type + ggml_type token_embedding_type; // token embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides + ggml_type attn_q_tensor_type; // attention query tensor type + ggml_type attn_k_tensor_type; // attention key tensor type + ggml_type attn_v_tensor_type; // attention value tensor type + ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + ggml_type attn_output_tensor_type; // attention output tensor type + ggml_type ffn_up_tensor_type; // feedforward up tensor type + ggml_type ffn_gate_tensor_type; // feedforward gate tensor type + ggml_type ffn_down_tensor_type; // feedforward down tensor type } llama_model_quantize_params; typedef struct llama_logit_bias { From 99bae5e9297484b2739867568a5900b7832bb2d1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 14 Mar 2025 12:15:44 +0000 Subject: [PATCH 12/26] Update descriptions to match existing style --- examples/quantize/quantize.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ad981c73354ab..d9ff48a2fc9f0 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -114,14 +114,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --attention-q-type ggml_type: use this ggml_type for the attention query tensor\n"); - printf(" --attention-k-type ggml_type: use this ggml_type for the attention key tensor\n"); - printf(" --attention-v-type ggml_type: use this ggml_type for the attention value tensor\n"); - printf(" --attention-qkv-type ggml_type: use this ggml_type for the attention qkv tensor\n"); - printf(" --attention-output-type ggml_type: use this ggml_type for the attention output tensor\n"); - printf(" --feedforward-up-type ggml_type: use this ggml_type for the feedforward up tensor\n"); - printf(" --feedforward-gate-type ggml_type: use this ggml_type for the feedforward gate tensor\n"); - printf(" --feedforward-down-type ggml_type: use this ggml_type for the feedforward down tensor\n"); + printf(" --attention-q-type ggml_type: use this ggml_type for the attn_q.weight tensor\n"); + printf(" --attention-k-type ggml_type: use this ggml_type for the attn_k.weight tensor\n"); + printf(" 
--attention-v-type ggml_type: use this ggml_type for the attn_v.weight tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); + printf(" --attention-output-type ggml_type: use this ggml_type for the attn_output.weight tensor\n"); + printf(" --feedforward-up-type ggml_type: use this ggml_type for the ffn_up.weight tensor\n"); + printf(" --feedforward-gate-type ggml_type: use this ggml_type for the ffn_gate.weight tensor\n"); + printf(" --feedforward-down-type ggml_type: use this ggml_type for the ffn_down.weight tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); From f97b693a40ab175ebc172565df4b83fd03f0faaa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:40:44 +0000 Subject: [PATCH 13/26] Add llama_model_quantize_params parameters --- include/llama.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 5e2c05d9bf24f..76fb75e85d126 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,14 +366,27 @@ extern "C" { bool keep_split; // quantize to the same number of shards void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides + ggml_type attn_qkv_tensor_type; // attention query/key/value tensor type ggml_type attn_q_tensor_type; // attention query tensor type ggml_type attn_k_tensor_type; // attention key tensor type ggml_type attn_v_tensor_type; // attention value tensor type - ggml_type attn_qkv_tensor_type; // attention query, key and value tensor type + ggml_type attn_qa_tensor_type; // attention query a tensor type + ggml_type attn_qb_tensor_type; // attention query b tensor type + ggml_type attn_kva_tensor_type; // attention key/value a tensor type + ggml_type attn_kvb_tensor_type; // attention key/value b tensor type ggml_type attn_output_tensor_type; // attention output tensor type ggml_type ffn_up_tensor_type; // feedforward up tensor type ggml_type ffn_gate_tensor_type; // feedforward gate tensor type ggml_type ffn_down_tensor_type; // feedforward down tensor type + ggml_type ffn_up_exp_tensor_type; // feedforward up expert tensor type + ggml_type ffn_gate_exp_tensor_type; // feedforward gate expert tensor type + ggml_type ffn_down_exp_tensor_type; // feedforward down expert tensor type + ggml_type ffn_up_shexp_tensor_type; // feedforward up shared expert tensor type + ggml_type ffn_gate_shexp_tensor_type; // feedforward gate shared expert tensor type + ggml_type ffn_down_shexp_tensor_type; // feedforward down shared expert tensor type + ggml_type cls_tensor_type; // classifier tensor type + ggml_type cls_output_tensor_type; // classifier output tensor type + } llama_model_quantize_params; typedef struct llama_logit_bias { From f11e3da291f2bf367d4d528640d3f1ebdabcdb3c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:42:27 +0000 Subject: [PATCH 14/26] Add new quantize parameters parsing and validation --- examples/quantize/quantize.cpp | 114 ++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 3 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index d9ff48a2fc9f0..949dacaef4f8b 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -285,6 +285,15 @@ int main(int argc, 
char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { if (arg_idx < argc-1) { params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); @@ -312,10 +321,37 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { + } else if (strcmp(argv[arg_idx], "--attention-qa-type") == 0) { if (arg_idx < argc-1) { - params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { + params.attn_qa_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qa_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-qb-type") == 0) { + if (arg_idx < argc-1) { + params.attn_qb_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_qb_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-kva-type") == 0) { + if (arg_idx < argc-1) { + params.attn_kva_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_kva_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--attention-kvb-type") == 0) { + if (arg_idx < argc-1) { + params.attn_kvb_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.attn_kvb_tensor_type == GGML_TYPE_COUNT) { usage(argv[0]); } } else { @@ -357,6 +393,78 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--feedforward-up-exp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_exp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-exp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_gate_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_exp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-exp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_exp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-up-shexp_type") == 0) { + if (arg_idx < argc-1) { + params.ffn_up_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_up_shexp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-gate-shexp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_gate_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_gate_shexp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--feedforward-down-shexp-type") == 0) { + if (arg_idx < argc-1) { + params.ffn_down_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.ffn_down_shexp_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else 
{ + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--classifier-type") == 0) { + if (arg_idx < argc-1) { + params.cls_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.cls_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--classifier-output-type") == 0) { + if (arg_idx < argc-1) { + params.cls_output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.cls_output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); From ad1e352425c1bcca1c62b859d8084f39fcb066eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:42:42 +0000 Subject: [PATCH 15/26] Update usage --- examples/quantize/quantize.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 949dacaef4f8b..5ad9e9502e590 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,7 +105,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qkv-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type] [--feedforward-down-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable); + printf(" [--token-embedding-type] [--attention-qkv-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qa-type]\n"); + printf(" [--attention-qb-type] [--attention-kva-type] [--attention-kvb-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type]\n"); + printf(" [--feedforward-down-type] [--feedforward-gate-exp-type] [--feedforward-down-exp-type] [--feedforward-up-exp-type] [--feedforward-gate-shexp-type]\n"); + printf(" [--feedforward-down-shexp-type] [--feedforward-up-shexp-type] [--classifier-type] [--classifier-output-type] [--override-kv]\n"); + printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -114,14 +119,26 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); printf(" --attention-q-type ggml_type: use this ggml_type for the attn_q.weight tensor\n"); printf(" --attention-k-type ggml_type: use this ggml_type for the attn_k.weight tensor\n"); printf(" --attention-v-type ggml_type: use this ggml_type for the attn_v.weight tensor\n"); - printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); + printf(" --attention-qa-type ggml_type: use this ggml_type for the attn_q_a.weight tensor\n"); + printf(" --attention-qb-type ggml_type: use this ggml_type for the attn_q_b.weight tensor\n"); + printf(" --attention-kva-type ggml_type: use this ggml_type for the attn_kv_a_mqa.weight tensor\n"); + printf(" --attention-kvb-type ggml_type: use this ggml_type for the attn_kv_b.weight tensor\n"); printf(" --attention-output-type ggml_type: use this ggml_type for the attn_output.weight tensor\n"); printf(" --feedforward-up-type ggml_type: use this ggml_type for the ffn_up.weight tensor\n"); printf(" --feedforward-gate-type ggml_type: use this ggml_type for the ffn_gate.weight tensor\n"); printf(" --feedforward-down-type ggml_type: use this ggml_type for the ffn_down.weight tensor\n"); + printf(" --feedforward-up-exp-type ggml_type: use this ggml_type for the ffn_up_exp.weight tensor\n"); + printf(" --feedforward-gate-exp-type ggml_type: use this ggml_type for the ffn_gate_exp.weight tensor\n"); + printf(" --feedforward-down-exp-type ggml_type: use this ggml_type for the ffn_down_exp.weight tensor\n"); + printf(" --feedforward-up-shexp-type ggml_type: use this ggml_type for the ffn_up_shexp.weight tensor\n"); + printf(" --feedforward-gate-shexp-type ggml_type: use this ggml_type for the ffn_gate_shexp.weight tensor\n"); + printf(" --feedforward-down-shexp-type ggml_type: use this ggml_type for the ffn_down_shexp.weight tensor\n"); + printf(" --classifier-type ggml_type: use this ggml_type for the cls.weight tensor\n"); + printf(" --classifier-output-type ggml_type: use this ggml_type for the cls.output.weight tensor\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n"); From 4e5c96a3e0624961f9c59ba1dac9b243a780011e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:43:25 +0000 Subject: [PATCH 16/26] Add new parameters defaults --- src/llama-quant.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b49b02aca7863..4d9b813f362de 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -939,14 +939,26 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, + /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qa_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_qb_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_kva_tensor_type =*/ GGML_TYPE_COUNT, + /*.attn_kvb_tensor_type =*/ GGML_TYPE_COUNT, /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_exp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_exp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_exp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_up_shexp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_gate_shexp_tensor_type =*/ GGML_TYPE_COUNT, + /*.ffn_down_shexp_tensor_type =*/ GGML_TYPE_COUNT, + /*.cls_tensor_type =*/ GGML_TYPE_COUNT, + /*.cls_output_tensor_type =*/ GGML_TYPE_COUNT, }; return result; From 9b3ccb535e0acbc2d34b1f91a4f06efdcfd6270e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 22:43:56 +0000 Subject: [PATCH 17/26] Add new quantization parameters logic --- src/llama-quant.cpp | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4d9b813f362de..467240c696f92 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -789,14 +789,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = params->token_embedding_type; } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { new_type = params->output_tensor_type; + } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { + new_type = params->attn_qkv_tensor_type; } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { new_type = params->attn_q_tensor_type; } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { new_type = params->attn_k_tensor_type; } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { new_type = params->attn_v_tensor_type; - } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { - new_type = params->attn_qkv_tensor_type; + } else if (params->attn_qa_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_a.weight") != std::string::npos) { + new_type = params->attn_qa_tensor_type; + } else if (params->attn_qb_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_b_mqa.weight") != std::string::npos) { + new_type = params->attn_qb_tensor_type; + } else if (params->attn_kva_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_a_mqa.weight") != 
std::string::npos) { + new_type = params->attn_kva_tensor_type; + } else if (params->attn_kvb_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_b.weight") != std::string::npos) { + new_type = params->attn_kvb_tensor_type; } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { new_type = params->attn_output_tensor_type; } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { @@ -805,6 +813,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = params->ffn_gate_tensor_type; } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { new_type = params->ffn_down_tensor_type; + } else if (params->ffn_up_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_exps.weight") != std::string::npos) { + new_type = params->ffn_up_exp_tensor_type; + } else if (params->ffn_gate_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate_exps.weight") != std::string::npos) { + new_type = params->ffn_gate_exp_tensor_type; + } else if (params->ffn_down_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_exps.weight") != std::string::npos) { + new_type = params->ffn_down_exp_tensor_type; + } else if (params->ffn_up_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_shexp.weight") != std::string::npos) { + new_type = params->ffn_up_shexp_tensor_type; + } else if (params->ffn_gate_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate_shexp.weight") != std::string::npos) { + new_type = params->ffn_gate_shexp_tensor_type; + } else if (params->ffn_down_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_shexp.weight") != std::string::npos) { + new_type = params->ffn_down_shexp_tensor_type; + } else if (params->cls_tensor_type < GGML_TYPE_COUNT && name.find("cls.weight") != std::string::npos) { + new_type = params->cls_tensor_type; + } else if (params->cls_output_tensor_type < GGML_TYPE_COUNT && name.find("cls.output.weight") != std::string::npos) { + new_type = params->cls_output_tensor_type; } else { new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } From 35f45f19d1fe31e2e7ae70a2b762c54b9122e209 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Mar 2025 23:10:04 +0000 Subject: [PATCH 18/26] Minor refactoring as per the contributors' guidelines --- examples/quantize/quantize.cpp | 3 +-- src/llama-quant.cpp | 16 +++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 5ad9e9502e590..a87cfd13b24c7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include struct quant_option { @@ -16,7 +15,7 @@ struct quant_option { std::string desc; }; -static const std::vector QUANT_OPTIONS = { +static const std::vector QUANT_OPTIONS = { { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 467240c696f92..bfe2e99f169c5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -5,11 +5,9 @@ #include "llama-model-loader.h" #include -#include #include #include #include -#include #include #include @@ -48,7 +46,7 @@ struct quantize_state_impl { }; static void 
llama_tensor_dequantize_impl( - struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, + ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread ) { if (output.size() < nelements) { @@ -536,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: model.load_hparams(ml); model.load_stats (ml); - struct quantize_state_impl qs(model, params); + quantize_state_impl qs(model, params); if (params->only_copy) { ftype = ml.ftype; @@ -661,7 +659,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // populate the original tensors so we get an initial meta data for (const auto * it : tensors) { uint16_t i_split = params->keep_split ? it->idx : 0; - struct ggml_tensor * tensor = it->tensor; + ggml_tensor * tensor = it->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } @@ -710,7 +708,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_ofstream(0); for (const auto * it : tensors) { const auto & weight = *it; - struct ggml_tensor * tensor = weight.tensor; + ggml_tensor * tensor = weight.tensor; if (weight.idx != cur_split && params->keep_split) { close_ofstream(); new_ofstream(weight.idx); @@ -776,7 +774,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; - enum ggml_type new_type; + ggml_type new_type; void * new_data; size_t new_size; @@ -950,8 +948,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // interface implementation // -struct llama_model_quantize_params llama_model_quantize_default_params() { - struct llama_model_quantize_params result = { +llama_model_quantize_params llama_model_quantize_default_params() { + llama_model_quantize_params result = { /*.nthread =*/ 0, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.output_tensor_type =*/ GGML_TYPE_COUNT, From 54e13cf69919180816f2e901037124c06c2d4b08 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Mar 2025 12:18:59 +0000 Subject: [PATCH 19/26] Implement general --tensor-type instead of tensor-specific command option --- examples/quantize/quantize.cpp | 301 +++++++++++---------------------- include/llama.h | 22 +-- src/llama-quant.cpp | 89 +++------- 3 files changed, 122 insertions(+), 290 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a87cfd13b24c7..88c2167b38a20 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -105,11 +105,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable); - printf(" [--token-embedding-type] [--attention-qkv-type] [--attention-q-type] [--attention-k-type] [--attention-v-type] [--attention-qa-type]\n"); - printf(" [--attention-qb-type] [--attention-kva-type] [--attention-kvb-type] [--attention-output-type] [--feedforward-up-type] [--feedforward-gate-type]\n"); - printf(" [--feedforward-down-type] [--feedforward-gate-exp-type] [--feedforward-down-exp-type] [--feedforward-up-exp-type] [--feedforward-gate-shexp-type]\n"); - printf(" [--feedforward-down-shexp-type] [--feedforward-up-shexp-type] 
[--classifier-type] [--classifier-output-type] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf(" [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -118,26 +114,8 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --attention-qkv-type ggml_type: use this ggml_type for the attn_qkv.weight tensor\n"); - printf(" --attention-q-type ggml_type: use this ggml_type for the attn_q.weight tensor\n"); - printf(" --attention-k-type ggml_type: use this ggml_type for the attn_k.weight tensor\n"); - printf(" --attention-v-type ggml_type: use this ggml_type for the attn_v.weight tensor\n"); - printf(" --attention-qa-type ggml_type: use this ggml_type for the attn_q_a.weight tensor\n"); - printf(" --attention-qb-type ggml_type: use this ggml_type for the attn_q_b.weight tensor\n"); - printf(" --attention-kva-type ggml_type: use this ggml_type for the attn_kv_a_mqa.weight tensor\n"); - printf(" --attention-kvb-type ggml_type: use this ggml_type for the attn_kv_b.weight tensor\n"); - printf(" --attention-output-type ggml_type: use this ggml_type for the attn_output.weight tensor\n"); - printf(" --feedforward-up-type ggml_type: use this ggml_type for the ffn_up.weight tensor\n"); - printf(" --feedforward-gate-type ggml_type: use this ggml_type for the ffn_gate.weight tensor\n"); - printf(" --feedforward-down-type ggml_type: use this ggml_type for the ffn_down.weight tensor\n"); - printf(" --feedforward-up-exp-type ggml_type: use this ggml_type for the ffn_up_exp.weight tensor\n"); - printf(" --feedforward-gate-exp-type ggml_type: use this ggml_type for the ffn_gate_exp.weight tensor\n"); - printf(" --feedforward-down-exp-type ggml_type: use this ggml_type for the ffn_down_exp.weight tensor\n"); - printf(" --feedforward-up-shexp-type ggml_type: use this ggml_type for the ffn_up_shexp.weight tensor\n"); - printf(" --feedforward-gate-shexp-type ggml_type: use this ggml_type for the ffn_gate_shexp.weight tensor\n"); - printf(" --feedforward-down-shexp-type ggml_type: use this ggml_type for the ffn_down_shexp.weight tensor\n"); - printf(" --classifier-type ggml_type: use this ggml_type for the cls.weight tensor\n"); - printf(" --classifier-output-type ggml_type: use this ggml_type for the cls.output.weight tensor\n"); + printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); + printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n"); @@ -268,6 +246,95 @@ static ggml_type parse_ggml_type(const char * arg) { return GGML_TYPE_COUNT; } +// Allowed tensors for arbitrary quantization with --tensor-type option +static const std::vector ALLOWED_TENSOR_TYPE = { + "attn_k", + "attn_kv_a_mqa", + "attn_kv_b", + "attn_out", + "attn_q_a", + "attn_q_b", + "attn_q", + "attn_qkv", + "attn_v", + "channel_mix_key", + "channel_mix_receptance", + "channel_mix_value", + "cls_out", + "cls", + "dec_attn_k", + "dec_attn_out", + "dec_attn_q", + "dec_attn_v", + "dec_cross_attn_k", + "dec_cross_attn_out", + "dec_cross_attn_q", + "dec_cross_attn_v", + "ffn_act", + "ffn_down_exp", + "ffn_down_shexp", + "ffn_down", + "ffn_gate_exp", + "ffn_gate_shexp", + "ffn_gate", + "ffn_up_exp", + "ffn_up_shexp", + "ffn_up", + "ssm_in", + "ssm_out", + "time_mix_gate", + "time_mix_key", + "time_mix_output", + "time_mix_receptance", + "time_mix_value", +}; + +// changes to this struct must be replicated in llama-quant.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + +static bool string_parse_tensor_type(const char * data, std::vector & tensor_type) { + const char * sep = strchr(data, '='); + if (sep == nullptr) { + printf("\n%s: malformed tensor type '%s'\n\n", __func__, data); + return false; + } + + const size_t tn_len = sep - data; + if (tn_len == 0) { + printf("\n%s: missing tensor name\n\n", __func__); + return false; + } + + if (const size_t qt_len = strlen(sep); qt_len == 1) { + printf("\n%s: missing quantization type\n\n", __func__); + return false; + } + + std::string tn(data, tn_len); + std::transform(tn.begin(), tn.end(), tn.begin(), tolower); + sep++; + const std::string qt(sep); + + if (find(ALLOWED_TENSOR_TYPE.begin(), ALLOWED_TENSOR_TYPE.end(), tn) == ALLOWED_TENSOR_TYPE.end()) { + printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str()); + return false; + } + + if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) { + printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str()); + return false; + } + + tensor_quantization tqz; + tqz.name = tn; + tqz.quant = parse_ggml_type(qt.c_str()); + tensor_type.emplace_back(std::move(tqz)); + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -279,6 +346,7 @@ int main(int argc, char ** argv) { std::string imatrix_file; std::vector included_weights, excluded_weights; std::vector kv_overrides; + std::vector tensor_types; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -301,184 +369,8 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--attention-qkv-type") == 0) { - if (arg_idx < argc-1) { - params.attn_qkv_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qkv_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-q-type") == 0) { - if (arg_idx < argc-1) { - params.attn_q_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_q_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-k-type") == 0) { - if (arg_idx < argc-1) { - params.attn_k_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_k_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], 
"--attention-v-type") == 0) { - if (arg_idx < argc-1) { - params.attn_v_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_v_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-qa-type") == 0) { - if (arg_idx < argc-1) { - params.attn_qa_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qa_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-qb-type") == 0) { - if (arg_idx < argc-1) { - params.attn_qb_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_qb_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-kva-type") == 0) { - if (arg_idx < argc-1) { - params.attn_kva_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_kva_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-kvb-type") == 0) { - if (arg_idx < argc-1) { - params.attn_kvb_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_kvb_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--attention-output-type") == 0) { - if (arg_idx < argc-1) { - params.attn_output_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.attn_output_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-up-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_up_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_up_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-gate-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_gate_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_gate_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-down-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_down_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_down_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-up-exp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_up_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_up_exp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-gate-exp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_gate_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_gate_exp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-down-exp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_down_exp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_down_exp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-up-shexp_type") == 0) { - if (arg_idx < argc-1) { - params.ffn_up_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_up_shexp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if 
(strcmp(argv[arg_idx], "--feedforward-gate-shexp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_gate_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_gate_shexp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--feedforward-down-shexp-type") == 0) { - if (arg_idx < argc-1) { - params.ffn_down_shexp_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.ffn_down_shexp_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--classifier-type") == 0) { - if (arg_idx < argc-1) { - params.cls_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.cls_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { - usage(argv[0]); - } - } else if (strcmp(argv[arg_idx], "--classifier-output-type") == 0) { - if (arg_idx < argc-1) { - params.cls_output_tensor_type = parse_ggml_type(argv[++arg_idx]); - if (params.cls_output_tensor_type == GGML_TYPE_COUNT) { - usage(argv[0]); - } - } else { + } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) { + if (arg_idx == argc-1 || !string_parse_tensor_type(argv[++arg_idx], tensor_types)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { @@ -565,6 +457,9 @@ int main(int argc, char ** argv) { kv_overrides.back().key[0] = 0; params.kv_overrides = &kv_overrides; } + if (!tensor_types.empty()) { + params.tensor_types = &tensor_types; + } llama_backend_init(); diff --git a/include/llama.h b/include/llama.h index 76fb75e85d126..261c6a4cd947c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,27 +366,7 @@ extern "C" { bool keep_split; // quantize to the same number of shards void * imatrix; // pointer to importance matrix data void * kv_overrides; // pointer to vector containing overrides - ggml_type attn_qkv_tensor_type; // attention query/key/value tensor type - ggml_type attn_q_tensor_type; // attention query tensor type - ggml_type attn_k_tensor_type; // attention key tensor type - ggml_type attn_v_tensor_type; // attention value tensor type - ggml_type attn_qa_tensor_type; // attention query a tensor type - ggml_type attn_qb_tensor_type; // attention query b tensor type - ggml_type attn_kva_tensor_type; // attention key/value a tensor type - ggml_type attn_kvb_tensor_type; // attention key/value b tensor type - ggml_type attn_output_tensor_type; // attention output tensor type - ggml_type ffn_up_tensor_type; // feedforward up tensor type - ggml_type ffn_gate_tensor_type; // feedforward gate tensor type - ggml_type ffn_down_tensor_type; // feedforward down tensor type - ggml_type ffn_up_exp_tensor_type; // feedforward up expert tensor type - ggml_type ffn_gate_exp_tensor_type; // feedforward gate expert tensor type - ggml_type ffn_down_exp_tensor_type; // feedforward down expert tensor type - ggml_type ffn_up_shexp_tensor_type; // feedforward up shared expert tensor type - ggml_type ffn_gate_shexp_tensor_type; // feedforward gate shared expert tensor type - ggml_type ffn_down_shexp_tensor_type; // feedforward down shared expert tensor type - ggml_type cls_tensor_type; // classifier tensor type - ggml_type cls_output_tensor_type; // classifier output tensor type - + void * tensor_types; // pointer to vector containing tensor types } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index bfe2e99f169c5..65c20904ba1a9 100644 --- a/src/llama-quant.cpp +++ 
b/src/llama-quant.cpp @@ -45,6 +45,12 @@ struct quantize_state_impl { {} }; +// changes to this struct must be replicated in quantize.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -783,54 +789,24 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. if (!params->pure && ggml_is_quantized(default_type)) { - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } else if (params->attn_qkv_tensor_type < GGML_TYPE_COUNT && name.find("attn_kqv.weight") != std::string::npos) { - new_type = params->attn_qkv_tensor_type; - } else if (params->attn_q_tensor_type < GGML_TYPE_COUNT && name.find("attn_q.weight") != std::string::npos) { - new_type = params->attn_q_tensor_type; - } else if (params->attn_k_tensor_type < GGML_TYPE_COUNT && name.find("attn_k.weight") != std::string::npos) { - new_type = params->attn_k_tensor_type; - } else if (params->attn_v_tensor_type < GGML_TYPE_COUNT && name.find("attn_v.weight") != std::string::npos) { - new_type = params->attn_v_tensor_type; - } else if (params->attn_qa_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_a.weight") != std::string::npos) { - new_type = params->attn_qa_tensor_type; - } else if (params->attn_qb_tensor_type < GGML_TYPE_COUNT && name.find("attn_q_b_mqa.weight") != std::string::npos) { - new_type = params->attn_qb_tensor_type; - } else if (params->attn_kva_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_a_mqa.weight") != std::string::npos) { - new_type = params->attn_kva_tensor_type; - } else if (params->attn_kvb_tensor_type < GGML_TYPE_COUNT && name.find("attn_kv_b.weight") != std::string::npos) { - new_type = params->attn_kvb_tensor_type; - } else if (params->attn_output_tensor_type < GGML_TYPE_COUNT && name.find("attn_output.weight") != std::string::npos) { - new_type = params->attn_output_tensor_type; - } else if (params->ffn_up_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up.weight") != std::string::npos) { - new_type = params->ffn_up_tensor_type; - } else if (params->ffn_gate_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate.weight") != std::string::npos) { - new_type = params->ffn_gate_tensor_type; - } else if (params->ffn_down_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down.weight") != std::string::npos) { - new_type = params->ffn_down_tensor_type; - } else if (params->ffn_up_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_exps.weight") != std::string::npos) { - new_type = params->ffn_up_exp_tensor_type; - } else if (params->ffn_gate_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_gate_exps.weight") != std::string::npos) { - new_type = params->ffn_gate_exp_tensor_type; - } else if (params->ffn_down_exp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_exps.weight") != std::string::npos) { - new_type = params->ffn_down_exp_tensor_type; - } else if (params->ffn_up_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_up_shexp.weight") != std::string::npos) { - new_type = params->ffn_up_shexp_tensor_type; - } else if (params->ffn_gate_shexp_tensor_type < 
GGML_TYPE_COUNT && name.find("ffn_gate_shexp.weight") != std::string::npos) { - new_type = params->ffn_gate_shexp_tensor_type; - } else if (params->ffn_down_shexp_tensor_type < GGML_TYPE_COUNT && name.find("ffn_down_shexp.weight") != std::string::npos) { - new_type = params->ffn_down_shexp_tensor_type; - } else if (params->cls_tensor_type < GGML_TYPE_COUNT && name.find("cls.weight") != std::string::npos) { - new_type = params->cls_tensor_type; - } else if (params->cls_output_tensor_type < GGML_TYPE_COUNT && name.find("cls.output.weight") != std::string::npos) { - new_type = params->cls_output_tensor_type; - } else { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + // unless the user specifies a type + if (params->tensor_types) { + const std::vector & tensor_types = *static_cast *>(params->tensor_types); + for (const auto & [name, quant] : tensor_types) { + if (std::string str(tensor->name); str.find(name) != std::string::npos) { + new_type = quant; + break; + } + } } } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. @@ -961,26 +937,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.keep_split =*/ false, /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, - /*.attn_qkv_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_q_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_k_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_v_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_qa_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_qb_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_kva_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_kvb_tensor_type =*/ GGML_TYPE_COUNT, - /*.attn_output_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_up_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_gate_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_down_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_up_exp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_gate_exp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_down_exp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_up_shexp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_gate_shexp_tensor_type =*/ GGML_TYPE_COUNT, - /*.ffn_down_shexp_tensor_type =*/ GGML_TYPE_COUNT, - /*.cls_tensor_type =*/ GGML_TYPE_COUNT, - /*.cls_output_tensor_type =*/ GGML_TYPE_COUNT, + /*.tensor_type =*/ nullptr, }; return result; From b3c7db572c44867746249ecb49527ae9b49f24f7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 30 Mar 2025 07:47:13 +0100 Subject: [PATCH 20/26] Fix implied type bug --- include/llama.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/llama.h b/include/llama.h index d226b5a857e94..e2ca8081e7b04 100644 --- a/include/llama.h +++ b/include/llama.h @@ -356,10 +356,10 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - llama_ftype ftype; // quantize to this llama_ftype - ggml_type output_tensor_type; // output tensor type - ggml_type token_embedding_type; // token embeddings tensor type + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum 
llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // token embeddings tensor type bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored From 625f0ae57b91ea117e5fdb450d644cf5a2c0eb42 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 31 Mar 2025 07:22:02 +0100 Subject: [PATCH 21/26] Restore missing #includes --- examples/quantize/quantize.cpp | 1 + src/llama-quant.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 88c2167b38a20..04e5ce4ef4d60 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include struct quant_option { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 65c20904ba1a9..eb3863e0dacfe 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -5,9 +5,11 @@ #include "llama-model-loader.h" #include +#include #include #include #include +#include #include #include From 2fd0b41f50af5323849f4ed79c34bef69825ddd3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 1 Apr 2025 22:20:04 +0100 Subject: [PATCH 22/26] Add regex capability for tensor selection --- examples/quantize/quantize.cpp | 10 +++++++++- src/llama-quant.cpp | 8 +++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 04e5ce4ef4d60..f642e6c1ba55d 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -319,7 +319,15 @@ static bool string_parse_tensor_type(const char * data, std::vector #include #include +#include #include #include @@ -795,9 +796,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // unless the user specifies a type if (params->tensor_types) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); - for (const auto & [name, quant] : tensor_types) { - if (std::string str(tensor->name); str.find(name) != std::string::npos) { - new_type = quant; + for (const auto & [tname, qtype] : tensor_types) { + if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) { + LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; break; } } From 054ede4eb1448c199bb98df804e16e6c19144f06 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 3 Apr 2025 08:07:49 +0100 Subject: [PATCH 23/26] Refactor function name and update ALLOWED_TENSOR_TYPE --- examples/quantize/quantize.cpp | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f642e6c1ba55d..ffdcc1092d4be 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -252,35 +252,32 @@ static const std::vector ALLOWED_TENSOR_TYPE = { "attn_k", "attn_kv_a_mqa", "attn_kv_b", - "attn_out", + "attn_o", + "attn_output", + "attn_q", "attn_q_a", "attn_q_b", - "attn_q", "attn_qkv", "attn_v", "channel_mix_key", "channel_mix_receptance", "channel_mix_value", - "cls_out", "cls", - "dec_attn_k", - "dec_attn_out", - "dec_attn_q", - "dec_attn_v", - "dec_cross_attn_k", - "dec_cross_attn_out", - "dec_cross_attn_q", - "dec_cross_attn_v", + "cls.output", + "cross_attn_k", + 
"cross_attn_o", + "cross_attn_q", + "cross_attn_v", "ffn_act", - "ffn_down_exp", - "ffn_down_shexp", "ffn_down", - "ffn_gate_exp", - "ffn_gate_shexp", + "ffn_down_exps", + "ffn_down_shexp", "ffn_gate", - "ffn_up_exp", - "ffn_up_shexp", + "ffn_gate_exps", + "ffn_gate_shexp", "ffn_up", + "ffn_up_exps", + "ffn_up_shexp", "ssm_in", "ssm_out", "time_mix_gate", @@ -296,7 +293,7 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; -static bool string_parse_tensor_type(const char * data, std::vector & tensor_type) { +static bool parse_tensor_type(const char * data, std::vector & tensor_type) { const char * sep = strchr(data, '='); if (sep == nullptr) { printf("\n%s: malformed tensor type '%s'\n\n", __func__, data); @@ -322,7 +319,7 @@ static bool string_parse_tensor_type(const char * data, std::vector Date: Thu, 3 Apr 2025 08:14:09 +0100 Subject: [PATCH 24/26] Add missing #include --- examples/quantize/quantize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ffdcc1092d4be..da6a1a467c467 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -9,6 +9,7 @@ #include #include #include +#include struct quant_option { std::string name; From 1acb9f4a841405cbda04e03b4fb8bc039b8e5792 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 3 Apr 2025 23:32:02 +0100 Subject: [PATCH 25/26] Handle edge case when tensor name is cls.output --- examples/quantize/quantize.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index da6a1a467c467..0355311dc5c06 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -319,8 +319,15 @@ static bool parse_tensor_type(const char * data, std::vector Date: Mon, 7 Apr 2025 19:36:29 +0100 Subject: [PATCH 26/26] Minor logging improvement --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 68d9c5bec53d5..dbac9f37f0323 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -798,7 +798,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const std::vector & tensor_types = *static_cast *>(params->tensor_types); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) { - LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + if (qtype != new_type) { + LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); + } new_type = qtype; break; }