From b7f502411a15cc8555a2c62422fae2328d2ff6ed Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sun, 29 Jun 2025 21:04:20 +0200
Subject: [PATCH 1/2] convert : correct gemma 3n conversion

---
 gguf-py/gguf/gguf_writer.py | 4 ++--
 src/llama-quant.cpp         | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index d32cd479adb17..379feb43fa7f6 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -714,8 +714,8 @@ def add_max_alibi_bias(self, bias: float) -> None:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
 
-    def add_shared_kv_layers(self, value: float) -> None:
-        self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
+    def add_shared_kv_layers(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value)
 
     def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
         self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index f4b5713d7dd9a..0a6a581b21f4b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -894,6 +894,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
+        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "per_layer_token_embd.weight") == 0) {
+            new_type = params->token_embedding_type;
+        }
         if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }

From 6692656dc5d8f5c699f1a775bdb232b433421b1d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sun, 29 Jun 2025 22:38:44 +0200
Subject: [PATCH 2/2] rm redundant code

---
 src/llama-quant.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 0a6a581b21f4b..f4b5713d7dd9a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -894,9 +894,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
-        if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "per_layer_token_embd.weight") == 0) {
-            new_type = params->token_embedding_type;
-        }
         if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
             new_type = params->output_tensor_type;
         }
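
Note: after PATCH 1/2, GGUFWriter.add_shared_kv_layers() takes an int and stores the shared-KV-layer count as a uint32 metadata key rather than a float32. A minimal call-site sketch, assuming a Gemma 3n converter that holds the writer as self.gguf_writer and that the relevant HF config field is named "num_kv_shared_layers" (that field name is an assumption, not taken from this patch):

    # hypothetical converter code; the hparams key name is assumed
    self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])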