From f470b268afbbc7d14458c5df701a2e0aaea7da58 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 1 Aug 2024 14:20:39 +0000 Subject: [PATCH 01/56] gptq_marlin compat dynamic_bits quantize config --- .../layers/quantization/gptq_marlin.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bdcc9c3b4f0c..4568c8364f90 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,3 +1,4 @@ +import re from typing import Any, Dict, List, Optional import torch @@ -23,29 +24,45 @@ class GPTQMarlinConfig(QuantizationConfig): """Config class for GPTQ Marlin""" def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool) -> None: + is_sym: bool, lm_head_quantized: bool, dynamic_bits: Dict[str, int]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - self.weight_bits = weight_bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + self.dynamic_bits = dynamic_bits + self._weight_bits = weight_bits + self._pack_factor = 32 // self._weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act self.is_sym = is_sym self.lm_head_quantized = lm_head_quantized # Verify supported on platform. - verify_gptq_marlin_supported(num_bits=self.weight_bits, + verify_gptq_marlin_supported(num_bits=self._weight_bits, group_size=self.group_size, is_sym=self.is_sym) + def get_weight_bits(self, prefix: str): + real_bits = self._weight_bits + if len(self.dynamic_bits) > 0 and prefix: + remove_prefix = r'^.*?(?=\d)' + match_name = re.sub(remove_prefix, '', prefix) + for pattern, dm_bits in self.dynamic_bits.items(): + if re.match(pattern, match_name): + real_bits = dm_bits + break + return real_bits + + def get_pack_factor(self, prefix: str): + return 32 // self.get_weight_bits(prefix) # packed into int32 + def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " + return (f"GPTQMarlinConfig(weight_bits={self._weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " - f"lm_head_quantized={self.lm_head_quantized})") + f"lm_head_quantized={self.lm_head_quantized}), " + f"dynamic_bits={self.dynamic_bits}") @classmethod def get_name(cls) -> str: @@ -65,6 +82,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": + dynamic_bits = cls.get_from_keys_or(config, ["dynamic_bits"], default={}) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -72,7 +90,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized) + lm_head_quantized, dynamic_bits) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -150,6 +168,7 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size + self.prefix = extra_weight_attrs.get("prefix", "") output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition @@ -178,11 +197,10 @@ def 
create_weights( # shard the scales in TP>1 case. scales_and_zp_input_dim = 0 scales_and_zp_size = input_size_per_partition // group_size - # Quantized weights qweight = Parameter( torch.empty( - input_size_per_partition // self.quant_config.pack_factor, + input_size_per_partition // self.quant_config.get_pack_factor(self.prefix), output_size_per_partition, dtype=torch.int32, ), @@ -195,7 +213,7 @@ def create_weights( "input_dim": 0, "output_dim": 1, "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, + "pack_factor": self.quant_config.get_pack_factor(self.prefix), }, ) @@ -238,7 +256,7 @@ def create_weights( qzeros = Parameter( torch.empty( scales_and_zp_size, - output_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition // self.quant_config.get_pack_factor(self.prefix), dtype=torch.int32, device="meta", ), @@ -251,7 +269,7 @@ def create_weights( "input_dim": scales_and_zp_input_dim, "output_dim": 1, "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, + "pack_factor": self.quant_config.get_pack_factor(self.prefix), }, ) @@ -293,7 +311,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.weight_bits) + num_bits=self.quant_config.get_weight_bits(self.prefix)) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -319,7 +337,7 @@ def apply( g_idx=layer.g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, - num_bits=self.quant_config.weight_bits, + num_bits=self.quant_config.get_weight_bits(self.prefix), output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, From 502edb36e8ed45828bf5402c398ef34f4c8ab3dc Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 2 Aug 2024 10:41:03 +0800 Subject: [PATCH 02/56] Update gptq_marlin.py --- .../layers/quantization/gptq_marlin.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4568c8364f90..cbcead0c7c20 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -44,15 +44,14 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym=self.is_sym) def get_weight_bits(self, prefix: str): - real_bits = self._weight_bits + bits = self._weight_bits + # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: - remove_prefix = r'^.*?(?=\d)' - match_name = re.sub(remove_prefix, '', prefix) - for pattern, dm_bits in self.dynamic_bits.items(): - if re.match(pattern, match_name): - real_bits = dm_bits + for pattern, dym_bits in self.dynamic_bits.items(): + if re.match(pattern, prefix): + bits = dym_bits break - return real_bits + return bits def get_pack_factor(self, prefix: str): return 32 // self.get_weight_bits(prefix) # packed into int32 From 18064cd0400a41375b9cdfb832e4ce3169be713d Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:23:30 +0000 Subject: [PATCH 03/56] cleanup --- .../layers/quantization/gptq_marlin.py | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py 
b/vllm/model_executor/layers/quantization/gptq_marlin.py index cbcead0c7c20..91f9053c8401 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,4 +1,5 @@ import re +from copy import deepcopy from typing import Any, Dict, List, Optional import torch @@ -31,33 +32,34 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, desc_act = False self.dynamic_bits = dynamic_bits - self._weight_bits = weight_bits - self._pack_factor = 32 // self._weight_bits # packed into int32 + self.weight_bits = weight_bits + self.pack_factor = 32 // self.weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act self.is_sym = is_sym self.lm_head_quantized = lm_head_quantized # Verify supported on platform. - verify_gptq_marlin_supported(num_bits=self._weight_bits, + verify_gptq_marlin_supported(num_bits=self.weight_bits, group_size=self.group_size, is_sym=self.is_sym) - def get_weight_bits(self, prefix: str): - bits = self._weight_bits + def update_bits_and_pack_factor(self, prefix: str): + print("lll", prefix) + bits = self.weight_bits # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: for pattern, dym_bits in self.dynamic_bits.items(): + print("re.match(pattern, prefix)",re.match(pattern, prefix), prefix) if re.match(pattern, prefix): bits = dym_bits break - return bits - - def get_pack_factor(self, prefix: str): - return 32 // self.get_weight_bits(prefix) # packed into int32 + if bits != self.weight_bits: + self.weight_bits = bits + self.pack_factor = 32 // self.weight_bits # packed into int32 def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self._weight_bits}, " + return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " @@ -154,7 +156,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase): """ def __init__(self, quant_config: GPTQMarlinConfig) -> None: - self.quant_config = quant_config + self.quant_config = deepcopy(quant_config) def create_weights( self, @@ -167,7 +169,9 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size - self.prefix = extra_weight_attrs.get("prefix", "") + prefix = extra_weight_attrs.get("prefix", "") + self.quant_config.update_bits_and_pack_factor(prefix=prefix) + print("wwww", self.quant_config.weight_bits) output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition @@ -199,7 +203,7 @@ def create_weights( # Quantized weights qweight = Parameter( torch.empty( - input_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), @@ -212,7 +216,7 @@ def create_weights( "input_dim": 0, "output_dim": 1, "packed_dim": 0, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, ) @@ -255,7 +259,7 @@ def create_weights( qzeros = Parameter( torch.empty( scales_and_zp_size, - output_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, device="meta", ), @@ -268,7 +272,7 @@ def create_weights( "input_dim": scales_and_zp_input_dim, "output_dim": 1, "packed_dim": 1, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, 
) @@ -310,7 +314,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.get_weight_bits(self.prefix)) + num_bits=self.quant_config.weight_bits) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -336,7 +340,7 @@ def apply( g_idx=layer.g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, - num_bits=self.quant_config.get_weight_bits(self.prefix), + num_bits=self.quant_config.weight_bits, output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, From 1b132c379eac2e7a769f1bcc32973ee5486b70b9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:25:38 +0000 Subject: [PATCH 04/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 91f9053c8401..213ca0c5921a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -169,9 +169,11 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size + prefix = extra_weight_attrs.get("prefix", "") + # Depending on prefix and dynamic_bits, bits and pack_factor may be modified. self.quant_config.update_bits_and_pack_factor(prefix=prefix) - print("wwww", self.quant_config.weight_bits) + output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition From 4b6375443d34d07994f6cf62a95b58a0b4ce2dba Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:26:20 +0000 Subject: [PATCH 05/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 213ca0c5921a..d91c098becea 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -202,6 +202,7 @@ def create_weights( # shard the scales in TP>1 case. 
scales_and_zp_input_dim = 0 scales_and_zp_size = input_size_per_partition // group_size + # Quantized weights qweight = Parameter( torch.empty( From 90258d2856339e7e0412b4e4c4b8137cba69db4f Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:26:58 +0000 Subject: [PATCH 06/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d91c098becea..d8a9c1b7b52f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -45,12 +45,10 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym=self.is_sym) def update_bits_and_pack_factor(self, prefix: str): - print("lll", prefix) bits = self.weight_bits # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: for pattern, dym_bits in self.dynamic_bits.items(): - print("re.match(pattern, prefix)",re.match(pattern, prefix), prefix) if re.match(pattern, prefix): bits = dym_bits break From a5d3c8b4ef6f0f3526c355288d6929074ba5408a Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:23:30 +0000 Subject: [PATCH 07/56] cleanup cleanup cleanup cleanup --- .../layers/quantization/gptq_marlin.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index cbcead0c7c20..d8a9c1b7b52f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,4 +1,5 @@ import re +from copy import deepcopy from typing import Any, Dict, List, Optional import torch @@ -31,33 +32,32 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, desc_act = False self.dynamic_bits = dynamic_bits - self._weight_bits = weight_bits - self._pack_factor = 32 // self._weight_bits # packed into int32 + self.weight_bits = weight_bits + self.pack_factor = 32 // self.weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act self.is_sym = is_sym self.lm_head_quantized = lm_head_quantized # Verify supported on platform. 
- verify_gptq_marlin_supported(num_bits=self._weight_bits, + verify_gptq_marlin_supported(num_bits=self.weight_bits, group_size=self.group_size, is_sym=self.is_sym) - def get_weight_bits(self, prefix: str): - bits = self._weight_bits + def update_bits_and_pack_factor(self, prefix: str): + bits = self.weight_bits # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: for pattern, dym_bits in self.dynamic_bits.items(): if re.match(pattern, prefix): bits = dym_bits break - return bits - - def get_pack_factor(self, prefix: str): - return 32 // self.get_weight_bits(prefix) # packed into int32 + if bits != self.weight_bits: + self.weight_bits = bits + self.pack_factor = 32 // self.weight_bits # packed into int32 def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self._weight_bits}, " + return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " @@ -154,7 +154,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase): """ def __init__(self, quant_config: GPTQMarlinConfig) -> None: - self.quant_config = quant_config + self.quant_config = deepcopy(quant_config) def create_weights( self, @@ -167,7 +167,11 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size - self.prefix = extra_weight_attrs.get("prefix", "") + + prefix = extra_weight_attrs.get("prefix", "") + # Depending on prefix and dynamic_bits, bits and pack_factor may be modified. + self.quant_config.update_bits_and_pack_factor(prefix=prefix) + output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition @@ -196,10 +200,11 @@ def create_weights( # shard the scales in TP>1 case. scales_and_zp_input_dim = 0 scales_and_zp_size = input_size_per_partition // group_size + # Quantized weights qweight = Parameter( torch.empty( - input_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), @@ -212,7 +217,7 @@ def create_weights( "input_dim": 0, "output_dim": 1, "packed_dim": 0, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, ) @@ -255,7 +260,7 @@ def create_weights( qzeros = Parameter( torch.empty( scales_and_zp_size, - output_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, device="meta", ), @@ -268,7 +273,7 @@ def create_weights( "input_dim": scales_and_zp_input_dim, "output_dim": 1, "packed_dim": 1, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, ) @@ -310,7 +315,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.get_weight_bits(self.prefix)) + num_bits=self.quant_config.weight_bits) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. 
@@ -336,7 +341,7 @@ def apply( g_idx=layer.g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, - num_bits=self.quant_config.get_weight_bits(self.prefix), + num_bits=self.quant_config.weight_bits, output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, From 5682124103f7706a92cd3f9bd69d00f3c74f67c2 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 08:48:54 +0000 Subject: [PATCH 08/56] load "dynamic" field from config --- .../layers/quantization/gptq_marlin.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d8a9c1b7b52f..a6c28a90671b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,6 +1,6 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from torch.nn.parameter import Parameter @@ -25,13 +25,13 @@ class GPTQMarlinConfig(QuantizationConfig): """Config class for GPTQ Marlin""" def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool, dynamic_bits: Dict[str, int]) -> None: + is_sym: bool, lm_head_quantized: bool, dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - self.dynamic_bits = dynamic_bits + self.dynamic = dynamic self.weight_bits = weight_bits self.pack_factor = 32 // self.weight_bits # packed into int32 self.group_size = group_size @@ -44,13 +44,16 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, group_size=self.group_size, is_sym=self.is_sym) - def update_bits_and_pack_factor(self, prefix: str): + def update_config(self, prefix: str): bits = self.weight_bits - # check for variable/dynamic bits - if len(self.dynamic_bits) > 0 and prefix: - for pattern, dym_bits in self.dynamic_bits.items(): + # check for variable/dynamic config + if len(self.dynamic) > 0 and prefix: + for pattern, dym in self.dynamic.items(): if re.match(pattern, prefix): - bits = dym_bits + bits = dym.get("bits", bits) + self.group_size = dym.get("group_size", self.group_size) + self.desc_act = dym.get("bits", self.desc_act) + self.is_sym = dym.get("is_sym", self.is_sym) break if bits != self.weight_bits: self.weight_bits = bits @@ -61,7 +64,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " - f"dynamic_bits={self.dynamic_bits}") + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -81,7 +84,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": - dynamic_bits = cls.get_from_keys_or(config, ["dynamic_bits"], default={}) + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -89,7 +92,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return 
cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized, dynamic_bits) + lm_head_quantized, dynamic) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -169,8 +172,8 @@ def create_weights( del output_size prefix = extra_weight_attrs.get("prefix", "") - # Depending on prefix and dynamic_bits, bits and pack_factor may be modified. - self.quant_config.update_bits_and_pack_factor(prefix=prefix) + # Depending on prefix and dynamic, some arguments may be modified. + self.quant_config.update_config(prefix=prefix) output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition From d651668b14b80affb96385cc31227651b05f04b0 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 17:30:51 +0000 Subject: [PATCH 09/56] fix key error: change "is_sym" to "sym" --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index a6c28a90671b..d0efd97e8907 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -53,7 +53,7 @@ def update_config(self, prefix: str): bits = dym.get("bits", bits) self.group_size = dym.get("group_size", self.group_size) self.desc_act = dym.get("bits", self.desc_act) - self.is_sym = dym.get("is_sym", self.is_sym) + self.is_sym = dym.get("sym", self.is_sym) break if bits != self.weight_bits: self.weight_bits = bits From e9ae8f5fa7168de585396b57c34a22a102459f26 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 6 Aug 2024 03:11:10 +0000 Subject: [PATCH 10/56] update quant_type --- .../model_executor/layers/quantization/gptq_marlin.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 8dc28c6f6264..71654fac4edf 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -40,6 +40,7 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.dynamic = dynamic self.weight_bits = weight_bits + self.is_sym = is_sym self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act @@ -66,9 +67,13 @@ def update_config(self, prefix: str): self.desc_act = dym.get("bits", self.desc_act) self.is_sym = dym.get("sym", self.is_sym) break - if bits != self.weight_bits: - self.weight_bits = bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + + self.pack_factor = 32 // bits # packed into int32 + if (bits, self.is_sym) not in self.TYPE_MAP: + raise ValueError("Unsupported quantization config: " + f"bits={bits}, sym={self.is_sym}") + + self.quant_type = self.TYPE_MAP[(bits, self.is_sym)] def __repr__(self) -> str: return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " From 19d77723aa88c1da28f9f98a64fde3c16a7e4c30 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 13:49:57 +0800 Subject: [PATCH 11/56] update --- .../layers/quantization/gptq_marlin.py | 558 +++++++++++++----- 1 file changed, 398 insertions(+), 160 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 71654fac4edf..9c125dd5149e 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ 
b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,22 +1,24 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union import torch -from torch.nn.parameter import Parameter - +import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full, - marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, - marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor, - verify_marlin_supported, verify_marlin_supports_shape) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.kernels import MPLinearLayerConfig, choose_mp_linear_kernel +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.marlin_utils import (check_marlin_supported, + marlin_moe_permute_scales, + marlin_repeat_scales_on_all_ranks, + verify_marlin_supported) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, + PackedColumnParameter, PackedvLLMParameter, RowvLLMParameter) +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -31,8 +33,15 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool, dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]] + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -41,6 +50,7 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.dynamic = dynamic self.weight_bits = weight_bits self.is_sym = is_sym + self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act @@ -52,21 +62,14 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # Verify supported on platform. 
- verify_marlin_supported(quant_type=self.quant_type, - group_size=self.group_size) - def update_config(self, prefix: str): bits = self.weight_bits # check for variable/dynamic config - if len(self.dynamic) > 0 and prefix: - for pattern, dym in self.dynamic.items(): - if re.match(pattern, prefix): - bits = dym.get("bits", bits) - self.group_size = dym.get("group_size", self.group_size) - self.desc_act = dym.get("bits", self.desc_act) - self.is_sym = dym.get("sym", self.is_sym) - break + if self.dynamic and len(self.dynamic) > 0 and prefix: + bits = self.dynamic_get(prefix, "bits", bits) + self.group_size = self.dynamic_get(prefix, "group_size", self.group_size) + self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) + self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) self.pack_factor = 32 // bits # packed into int32 if (bits, self.is_sym) not in self.TYPE_MAP: @@ -131,24 +134,42 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None + def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int, bool] = None) -> Union[Dict, int, bool]: + for pattern, pattern_dict in self.dynamic.items(): + if pattern.startswith("-:"): + if re.match(pattern.removeprefix("-:"), layer_name): + return False + elif re.match(pattern.removeprefix("+:"), layer_name): + if key is None: + return pattern_dict + else: + return pattern_dict.get(key, default_value) + return default_value + def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQMarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return GPTQMarlinLinearMethod(self) + prefix: str + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: + if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + return UnquantizedLinearMethod() + + if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) + and self.lm_head_quantized): + return GPTQMarlinLinearMethod(self, prefix=prefix) + elif isinstance(layer, FusedMoE): + return GPTQMarlinMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() - num_bits = quant_config.get("bits", None) - group_size = quant_config.get("group_size", None) - sym = quant_config.get("sym", None) - desc_act = quant_config.get("desc_act", None) + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + sym = quant_config.get("sym") + desc_act = quant_config.get("desc_act") + + if not current_platform.is_cuda(): + return False if quant_method != "gptq": return False @@ -162,8 +183,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): return False return check_marlin_supported(quant_type=cls.TYPE_MAP[(num_bits, sym)], - group_size=group_size, - min_capability=cls.get_min_capability()) + group_size=group_size) class GPTQMarlinLinearMethod(LinearMethodBase): @@ -173,8 +193,15 @@ class GPTQMarlinLinearMethod(LinearMethodBase): quant_config: The GPTQ Marlin quantization config. 
""" - def __init__(self, quant_config: GPTQMarlinConfig) -> None: + _kernel_backends_being_used: Set[str] = set() + + def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.quant_config = deepcopy(quant_config) + self.prefix = prefix + + # Verify supported on platform. + verify_marlin_supported(quant_type=self.quant_config.quant_type, + group_size=self.quant_config.group_size) def create_weights( self, @@ -186,14 +213,30 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - del output_size - - prefix = extra_weight_attrs.get("prefix", "") # Depending on prefix and dynamic, some arguments may be modified. - self.quant_config.update_config(prefix=prefix) + self.quant_config.update_config(prefix=self.prefix) output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition + weight_loader = extra_weight_attrs.get("weight_loader") + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=\ + (input_size_per_partition, output_size_per_partition), + weight_type=self.quant_config.quant_type, + act_type=params_dtype, + group_size=self.quant_config.group_size, + zero_points=False, + has_g_idx=self.quant_config.desc_act + ) + + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for GPTQMarlinLinearMethod", + kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) # Normalize group_size if self.quant_config.group_size != -1: @@ -201,12 +244,6 @@ def create_weights( else: group_size = input_size - verify_marlin_supports_shape( - output_size_per_partition=output_size_per_partition, - input_size_per_partition=input_size_per_partition, - input_size=input_size, - group_size=group_size) - # Determine sharding if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, self.quant_config.group_size, @@ -222,147 +259,348 @@ def create_weights( scales_and_zp_size = input_size_per_partition // group_size # Quantized weights - qweight = Parameter( - torch.empty( + qweight = PackedvLLMParameter( + data=torch.empty( input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), - requires_grad=False, - ) - set_weight_attrs( - qweight, - { - **extra_weight_attrs, - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, - }, - ) + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader) # Activation order - g_idx = Parameter( + g_idx = RowvLLMParameter(data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader) + + qzeros_args = { + "data": torch.empty( - input_size_per_partition, + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, ), - requires_grad=False, - ) - # Ignore warning from fused linear layers such as QKVParallelLinear. 
- set_weight_attrs( - g_idx, - { - **extra_weight_attrs, "input_dim": 0, - "ignore_warning": True - }, - ) - - # Scales - scales = Parameter( + "weight_loader": + weight_loader + } + weight_scale_args = { + "data": torch.empty( scales_and_zp_size, output_size_per_partition, dtype=params_dtype, ), + "weight_loader": + weight_loader + } + + if scales_and_zp_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, + **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args) + + else: + scales = GroupQuantScaleParameter(output_dim=1, + input_dim=0, + **weight_scale_args) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("scales", scales) + layer.register_parameter("qzeros", qzeros) + + self.kernel = kernel_type(mp_linear_kernel_config, + w_q_param_name="qweight", + w_s_param_name="scales", + w_zp_param_name="qzeros", + w_gidx_param_name="g_idx") + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) + + +class GPTQMarlinMoEMethod(FusedMoEMethodBase): + """MoE Marlin method with quantization.""" + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Currently assuming is_k_full is always True + # (input size per partition is the same as full input size) + # Supports only sym for now (no zp) + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + scales_size2 = intermediate_size // self.quant_config.group_size + strategy = FusedMoeWeightScaleSupported.GROUP.value + else: + scales_size13 = 1 + scales_size2 = 1 + strategy = FusedMoeWeightScaleSupported.CHANNEL.value + + extra_weight_attrs.update({ + "quant_method": strategy, + "is_transposed": True + }) + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size, + dtype=torch.int32, + ), requires_grad=False, ) - set_weight_attrs( - scales, - { - **extra_weight_attrs, - "input_dim": scales_and_zp_input_dim, - "output_dim": 1, - }, + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, ) - - # Quantized zero-points - qzeros = Parameter( + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales 
= torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + # up_proj scales + w13_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + # down_proj scales + w2_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + w13_g_idx = torch.nn.Parameter( torch.empty( - scales_and_zp_size, - output_size_per_partition // self.quant_config.pack_factor, + num_experts, + hidden_size, dtype=torch.int32, - device="meta", ), requires_grad=False, ) - set_weight_attrs( - qzeros, - { - **extra_weight_attrs, - "input_dim": scales_and_zp_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) - layer.register_parameter("qweight", qweight) - layer.register_parameter("g_idx", g_idx) - layer.register_parameter("scales", scales) - layer.register_parameter("qzeros", qzeros) - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.input_size = input_size - layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act, - is_row_parallel) - - # Checkpoints are serialized in AutoGPTQ format, which is different from the - # marlin format. This function is called after the weights are loaded. - # Here, we handle the repacking, including the activation reordering case. def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - device = layer.qweight.device - # Allocate marlin workspace - layer.workspace = marlin_make_workspace( - layer.output_size_per_partition, device) - - # Handle sorting for activation reordering if needed. 
+ # Process act_order if self.quant_config.desc_act: - g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.g_idx) - layer.g_idx_sort_indices = g_idx_sort_indices - replace_tensor(layer, "g_idx", g_idx) + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( + torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx) + replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx) + replace_parameter(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_parameter(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) else: - layer.g_idx = marlin_make_empty_g_idx(device) - layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) - - # No zero-point - layer.zp = marlin_make_empty_g_idx(device) - - # Repack weights from autogptq format to marlin format. - marlin_qweight = ops.gptq_marlin_repack( - layer.qweight, - perm=layer.g_idx_sort_indices, - size_k=layer.input_size_per_partition, - size_n=layer.output_size_per_partition, - num_bits=self.quant_config.quant_type.size_bits) - replace_tensor(layer, "qweight", marlin_qweight) - - # Permute scales from autogptq format to marlin format. - marlin_scales = marlin_permute_scales( - layer.scales, - size_k=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), - size_n=layer.output_size_per_partition, - group_size=self.quant_config.group_size) - replace_tensor(layer, "scales", marlin_scales) + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_parameter(layer, "w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, + layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + s=layer.w13_scales, + size_k=layer.intermediate_size_per_partition, + size_n=layer.w13_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_parameter(layer, "w13_scales", marlin_w13_scales) + 
marlin_w2_scales = marlin_moe_permute_scales( + s=layer.w2_scales, + size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_n=layer.w2_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_parameter(layer, "w2_scales", marlin_w2_scales) def apply( self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor] = None, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - return apply_gptq_marlin_linear( - input=x, - weight=layer.qweight, - weight_scale=layer.scales, - weight_zp=layer.zp, - g_idx=layer.g_idx, - g_idx_sort_indices=layer.g_idx_sort_indices, - workspace=layer.workspace, - wtype=self.quant_config.quant_type, - output_size_per_partition=layer.output_size_per_partition, - input_size_per_partition=layer.input_size_per_partition, - is_k_full=layer.is_k_full, - bias=bias) + # The input must currently be float16 + orig_dtype = x.dtype + x = x.half() + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=None) + + return torch.ops.vllm.fused_marlin_moe( + x, + layer.w13_qweight, + layer.w2_qweight, + layer.w13_scales, + layer.w2_scales, + router_logits, + topk_weights, + topk_ids, + g_idx1=layer.w13_g_idx, + g_idx2=layer.w2_g_idx, + sort_indices1=layer.w13_g_idx_sort_indices, + sort_indices2=layer.w2_g_idx_sort_indices, + num_bits=self.quant_config.quant_type.size_bits, + ).to(orig_dtype) \ No newline at end of file From 856532804685b47661708d352c4850979f047ee7 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:15:11 +0800 Subject: [PATCH 12/56] fix judgment error --- vllm/model_executor/layers/quantization/gptq_marlin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 94187214b741..14dd51df62d7 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -157,11 +157,11 @@ def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: - if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 - return UnquantizedLinearMethod() - if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): + if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + return UnquantizedLinearMethod() + return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) From 84ada54655b77dd2810d927cad3a4683bb1a0043 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:19:32 +0800 Subject: [PATCH 13/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 14dd51df62d7..18d36a1b230f 100644 
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -10,7 +10,8 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs, UnquantizedLinearMethod) + set_weight_attrs, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kernels import ( @@ -141,7 +142,8 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int, bool] = None) -> Union[Dict, int, bool]: + def dynamic_get(self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None + ) -> Union[Dict, int, bool]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -161,7 +163,7 @@ def get_quant_method( and self.lm_head_quantized): if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 return UnquantizedLinearMethod() - + return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) From e81a7da33392a310c670d069263e0be0987d61b0 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:28:17 +0800 Subject: [PATCH 14/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 18d36a1b230f..ed420f77ec53 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -71,11 +71,13 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] def update_config(self, prefix: str): - bits = self.weight_bits + bits: Optional[int] = self.weight_bits # check for variable/dynamic config if self.dynamic and len(self.dynamic) > 0 and prefix: bits = self.dynamic_get(prefix, "bits", bits) - self.group_size = self.dynamic_get(prefix, "group_size", self.group_size) + group_size = self.dynamic_get(prefix, "group_size", self.group_size) + assert group_size is not None + self.group_size = group_size self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) @@ -158,7 +160,7 @@ def dynamic_get(self, layer_name: str, key: Optional[str] = None, default_value: def get_quant_method( self, layer: torch.nn.Module, prefix: str - ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 From 68291ce51551a62367c38b51145a6b89f578bd31 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:30:17 +0800 Subject: [PATCH 15/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 
ed420f77ec53..81263ad51d90 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -75,9 +75,8 @@ def update_config(self, prefix: str): # check for variable/dynamic config if self.dynamic and len(self.dynamic) > 0 and prefix: bits = self.dynamic_get(prefix, "bits", bits) - group_size = self.dynamic_get(prefix, "group_size", self.group_size) - assert group_size is not None - self.group_size = group_size + self.group_size = self.dynamic_get(prefix, "group_size", + self.group_size) self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) @@ -144,7 +143,8 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get(self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None + def dynamic_get(self, layer_name: str, key: Optional[str] = None, + default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): From 78674057820283efac50fe98c1b5dd378b992453 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:47:32 +0800 Subject: [PATCH 16/56] cleanup --- .../layers/quantization/gptq_marlin.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 81263ad51d90..44efedfe92ef 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -10,8 +10,8 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs, - UnquantizedLinearMethod) + UnquantizedLinearMethod, + set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kernels import ( @@ -41,15 +41,9 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__( - self, - weight_bits: int, - group_size: int, - desc_act: bool, - is_sym: bool, - lm_head_quantized: bool, - dynamic: Dict[str, Dict[str, Union[int, bool]]] - ) -> None: + def __init__(self, weight_bits: int, group_size: int, desc_act: bool, + is_sym: bool, lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -143,9 +137,12 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get(self, layer_name: str, key: Optional[str] = None, - default_value: Union[int, bool, None] = None - ) -> Union[Dict, int, bool]: + def dynamic_get( + self, + layer_name: str, + key: Optional[str] = None, + default_value: Union[int, bool, + None] = None) -> Union[Dict, int, bool]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -157,13 +154,12 @@ def dynamic_get(self, layer_name: str, key: Optional[str] = None, return pattern_dict.get(key, default_value) return default_value - def get_quant_method( - self, layer: torch.nn.Module, prefix: str + self, layer: torch.nn.Module, 
prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + if self.dynamic and not self.dynamic_get(layer_name=prefix): return UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) From c63ba512cd14b14b408da7807b28ac21090f47f1 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 17:44:03 +0800 Subject: [PATCH 17/56] cleanup --- .../layers/quantization/gptq_marlin.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 44efedfe92ef..90a9688020c8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -65,14 +65,22 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] def update_config(self, prefix: str): - bits: Optional[int] = self.weight_bits + bits = self.weight_bits # check for variable/dynamic config if self.dynamic and len(self.dynamic) > 0 and prefix: - bits = self.dynamic_get(prefix, "bits", bits) - self.group_size = self.dynamic_get(prefix, "group_size", - self.group_size) - self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) - self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) + b = self.dynamic_get(prefix, "bits", bits) + if isinstance(b, int): + bits = b + group_size = self.dynamic_get(prefix, "group_size", + self.group_size) + if isinstance(group_size, int): + self.group_size = group_size + desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) + if isinstance(desc_act, bool): + self.desc_act = desc_act + is_sym = self.dynamic_get(prefix, "sym", self.is_sym) + if isinstance(is_sym, bool): + self.is_sym = is_sym self.pack_factor = 32 // bits # packed into int32 if (bits, self.is_sym) not in self.TYPE_MAP: @@ -141,8 +149,8 @@ def dynamic_get( self, layer_name: str, key: Optional[str] = None, - default_value: Union[int, bool, - None] = None) -> Union[Dict, int, bool]: + default_value: Union[int, bool, None] = None + ) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -156,11 +164,14 @@ def dynamic_get( def get_quant_method( self, layer: torch.nn.Module, prefix: str - ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", + UnquantizedLinearMethod]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic and not self.dynamic_get(layer_name=prefix): - return UnquantizedLinearMethod() + if self.dynamic: + result = self.dynamic_get(layer_name=prefix) + if result is not None and not result: + return UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 5f9b712cc18ec6a71416f3b6b8d2022ac92fdb78 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 17:51:15 +0800 Subject: [PATCH 18/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 90a9688020c8..ad188ee54f8d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -232,7 +232,7 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # Depending on prefix and dynamic, some arguments may be modified. + # gptqmodel per module/layer dynamic config my override/change base model quant config self.quant_config.update_config(prefix=self.prefix) output_size_per_partition = sum(output_partition_sizes) From 36925788aade16a66f022dc985d6f4209503b94a Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 17:59:55 +0800 Subject: [PATCH 19/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ad188ee54f8d..c007aedaf0a8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -232,7 +232,7 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # gptqmodel per module/layer dynamic config my override/change base model quant config + # gptqmodel's dynamic config per module may override base model quant config self.quant_config.update_config(prefix=self.prefix) output_size_per_partition = sum(output_partition_sizes) From f902b2d08868892451ac769844eed6ee50bcd74e Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:01:32 +0800 Subject: [PATCH 20/56] cleanup --- .../layers/quantization/gptq_marlin.py | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ad188ee54f8d..3d95bf5cf9a3 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -43,13 +43,13 @@ class GPTQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym: bool, lm_head_quantized: bool, - dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: + dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - self.dynamic = dynamic + self.dynamic_cfg = dynamic_cfg self.weight_bits = weight_bits self.is_sym = is_sym @@ -66,21 +66,21 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, def update_config(self, prefix: str): bits = self.weight_bits - # check for variable/dynamic config - if self.dynamic and len(self.dynamic) > 0 and prefix: - b = self.dynamic_get(prefix, "bits", bits) - if isinstance(b, int): - bits = b - group_size = self.dynamic_get(prefix, "group_size", - self.group_size) - if isinstance(group_size, int): - self.group_size = group_size - desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) - if isinstance(desc_act, bool): - self.desc_act = desc_act - is_sym = self.dynamic_get(prefix, "sym", self.is_sym) - if isinstance(is_sym, bool): - self.is_sym = is_sym + + b = self.gptqmodel_dynamic_config(prefix, "bits", bits) + if isinstance(b, int): + bits = b + group_size = 
self.gptqmodel_dynamic_config(prefix, "group_size", + self.group_size) + if isinstance(group_size, int): + self.group_size = group_size + desc_act = self.gptqmodel_dynamic_config(prefix, "desc_act", + self.desc_act) + if isinstance(desc_act, bool): + self.desc_act = desc_act + is_sym = self.gptqmodel_dynamic_config(prefix, "sym", self.is_sym) + if isinstance(is_sym, bool): + self.is_sym = is_sym self.pack_factor = 32 // bits # packed into int32 if (bits, self.is_sym) not in self.TYPE_MAP: @@ -94,7 +94,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " - f"dynamic={self.dynamic}") + f"dynamic_cfg={self.dynamic_cfg}") @classmethod def get_name(cls) -> str: @@ -114,7 +114,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": - dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic_cfg = cls.get_from_keys_or(config, ["dynamic"], default={}) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -122,7 +122,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized, dynamic) + lm_head_quantized, dynamic_cfg) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -145,13 +145,13 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get( + def gptqmodel_dynamic_config( self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: - for pattern, pattern_dict in self.dynamic.items(): + for pattern, pattern_dict in self.dynamic_cfg.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False @@ -168,8 +168,8 @@ def get_quant_method( UnquantizedLinearMethod]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic: - result = self.dynamic_get(layer_name=prefix) + if self.dynamic_cfg: + result = self.gptqmodel_dynamic_config(layer_name=prefix) if result is not None and not result: return UnquantizedLinearMethod() @@ -218,6 +218,11 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.quant_config = deepcopy(quant_config) self.prefix = prefix + if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: + # gptqmodel per module/layer dynamic config my override/change base + # model quant config + self.quant_config.update_config(prefix=self.prefix) + # Verify supported on platform. 
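(Side note on the hunk above: because each GPTQMarlinLinearMethod deep-copies the quant config and then calls update_config() with its own prefix, two layers of the same model can end up with different effective settings while the shared base config stays untouched. A minimal sketch of the idea, assuming a platform where the Marlin kernels are supported; the rule and layer prefixes below are illustrative only and not part of this patch.)

from vllm.model_executor.layers.quantization.gptq_marlin import (
    GPTQMarlinConfig, GPTQMarlinLinearMethod)

# Hypothetical: the base model is 4-bit / group_size 128, and only layer 1 is
# bumped to 8-bit / group_size 32 by a single positive rule.
dynamic_cfg = {r"+:.*\.layers\.1\..*": {"bits": 8, "group_size": 32}}
base_cfg = GPTQMarlinConfig(weight_bits=4, group_size=128, desc_act=False,
                            is_sym=True, lm_head_quantized=False,
                            dynamic_cfg=dynamic_cfg)
layer0 = GPTQMarlinLinearMethod(base_cfg, prefix="model.layers.0.self_attn.qkv_proj")
layer1 = GPTQMarlinLinearMethod(base_cfg, prefix="model.layers.1.self_attn.qkv_proj")
# layer0.quant_config keeps 4-bit / group_size 128; layer1.quant_config was
# overridden to 8-bit / group_size 32 inside __init__ via update_config(),
# and base_cfg itself is left unchanged.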
verify_marlin_supported(quant_type=self.quant_config.quant_type, group_size=self.quant_config.group_size) @@ -232,9 +237,6 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # gptqmodel per module/layer dynamic config my override/change base model quant config - self.quant_config.update_config(prefix=self.prefix) - output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition weight_loader = extra_weight_attrs.get("weight_loader") From 9b9d7e3fb359c60ebe28be26133c9fa3ceb63958 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 18:04:33 +0800 Subject: [PATCH 21/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4a67b6ebd3c4..a369dc99b7ee 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -152,9 +152,11 @@ def gptqmodel_dynamic_config( default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in self.dynamic_cfg.items(): + # negative match: matched modules are excluded from quantization if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False + # positive match: matched modules have quant properties overriding base quant config elif re.match(pattern.removeprefix("+:"), layer_name): if key is None: return pattern_dict From 055913782013a6c58482ee33008187ce5c4a5a11 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:09:17 +0800 Subject: [PATCH 22/56] cleanup --- .../layers/quantization/gptq_marlin.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4a67b6ebd3c4..5be7b8a0c683 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -41,9 +41,15 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool, - dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]]) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]], + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -145,7 +151,7 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def gptqmodel_dynamic_config( + def get_dynamic_config( self, layer_name: str, key: Optional[str] = None, @@ -219,7 +225,7 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.prefix = prefix if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: - # gptqmodel per module/layer dynamic config my override/change base + # gptqmodel per module/layer dynamic_cfg my override/change base # model quant config self.quant_config.update_config(prefix=self.prefix) @@ -237,9 +243,6 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # gptqmodel per module/layer dynamic config my 
override/change base model quant config - self.quant_config.update_config(prefix=self.prefix) - output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition weight_loader = extra_weight_attrs.get("weight_loader") From 3a2bb94b5881966b380258062fb52b914d66893c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:27:42 +0800 Subject: [PATCH 23/56] cleanup --- .../layers/quantization/gptq_marlin.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index eff4aacc93e1..8b8fec76fbe5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -73,18 +73,17 @@ def __init__( def update_config(self, prefix: str): bits = self.weight_bits - b = self.gptqmodel_dynamic_config(prefix, "bits", bits) + b = self.get_dynamic_config(prefix, "bits", bits) if isinstance(b, int): bits = b - group_size = self.gptqmodel_dynamic_config(prefix, "group_size", - self.group_size) + group_size = self.get_dynamic_config(prefix, "group_size", + self.group_size) if isinstance(group_size, int): self.group_size = group_size - desc_act = self.gptqmodel_dynamic_config(prefix, "desc_act", - self.desc_act) + desc_act = self.get_dynamic_config(prefix, "desc_act", self.desc_act) if isinstance(desc_act, bool): self.desc_act = desc_act - is_sym = self.gptqmodel_dynamic_config(prefix, "sym", self.is_sym) + is_sym = self.get_dynamic_config(prefix, "sym", self.is_sym) if isinstance(is_sym, bool): self.is_sym = is_sym @@ -162,7 +161,8 @@ def get_dynamic_config( if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False - # positive match: matched modules have quant properties overriding base quant config + # positive match: matched modules have quant properties overriding + # base quant config elif re.match(pattern.removeprefix("+:"), layer_name): if key is None: return pattern_dict @@ -177,7 +177,7 @@ def get_quant_method( if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): if self.dynamic_cfg: - result = self.gptqmodel_dynamic_config(layer_name=prefix) + result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: return UnquantizedLinearMethod() From 3c0d45aad38f4896598834e3390f0948ed52070a Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:29:11 +0800 Subject: [PATCH 24/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 8b8fec76fbe5..66574127b985 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -176,7 +176,7 @@ def get_quant_method( UnquantizedLinearMethod]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic_cfg: + if len(self.dynamic_cfg) > 0: result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: return UnquantizedLinearMethod() From 74b1d4223e93d45e678a0d6efd5ed3965eedca63 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 20:41:00 +0800 Subject: [PATCH 25/56] add test_gptq_dynamic_cfg.py --- 
tests/quantization/test_gptq_dynamic_cfg.py | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/quantization/test_gptq_dynamic_cfg.py diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py new file mode 100644 index 000000000000..7e0b9d88a9cf --- /dev/null +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -0,0 +1,46 @@ +"""Tests whether gptq models with dynamic_cfg quantized can be loaded. + +Run `pytest tests/quantization/test_gptq_dynamic_cfg.py --forked`. +""" + +import pytest +import torch + +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod) +from vllm.model_executor.layers.linear import UnquantizedLinearMethod + +PROMPT = "On the surface of Mars, we found" + +MODELS_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] + + +@pytest.mark.parametrize("model_id", MODELS_QUANT) +def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): + vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + + for name, submodule in (vllm_model.model.llm_engine.model_executor. + driver_worker.model_runner.model.named_modules()): + if name == 'model.model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128, + # desc_act=True + assert isinstance(submodule, GPTQMarlinLinearMethod) + assert submodule.quant_config.bits == 4 + assert submodule.quant_config.group_size == 128 + assert submodule.quant_config.desc_act + elif name == 'model.model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32, + # desc_act=False + assert isinstance(submodule, GPTQMarlinLinearMethod) + assert submodule.quant_config.bits == 8 + assert submodule.quant_config.group_size == 32 + assert not submodule.quant_config.desc_act + elif (name == 'model.model.layers.2.self_attn.qkv_proj' + or name == 'model.model.layers.2.mlp.gate_up_proj'): + # Other layers are not quantized. 
+ assert isinstance(submodule, UnquantizedLinearMethod) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) + del vllm_model From b0672aea6006d3168ccfbb00ad44e56ee1adeb43 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 20:41:25 +0800 Subject: [PATCH 26/56] cleanup --- tests/quantization/test_gptq_dynamic_cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 7e0b9d88a9cf..9a7ac159dbed 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -6,9 +6,9 @@ import pytest import torch +from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) -from vllm.model_executor.layers.linear import UnquantizedLinearMethod PROMPT = "On the surface of Mars, we found" From 066f4898a1974baf063ee46eb902f6bb72dafe37 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 21:05:38 +0800 Subject: [PATCH 27/56] Update test_gptq_dynamic_cfg.py --- tests/quantization/test_gptq_dynamic_cfg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 9a7ac159dbed..86c290373ed4 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -12,6 +12,9 @@ PROMPT = "On the surface of Mars, we found" +# The first layer is quantized using bits=4, group_size=128 +# The second layer is quantized using bits=8, group_size=32 +# All other layers (layer index >= 2) are not quantized MODELS_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] From 6dc56a69088e09a36b0c9b2ba119c68366f1682d Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 21:07:32 +0800 Subject: [PATCH 28/56] Update test_gptq_dynamic_cfg.py --- tests/quantization/test_gptq_dynamic_cfg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 86c290373ed4..7f2ad3dfa854 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -25,14 +25,14 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): for name, submodule in (vllm_model.model.llm_engine.model_executor. driver_worker.model_runner.model.named_modules()): if name == 'model.model.layers.0.self_attn.qkv_proj': - # The first layer is quantized using bits=4, group_size=128, + # The first layer is quantized using bits=4, group_size=128 # desc_act=True assert isinstance(submodule, GPTQMarlinLinearMethod) assert submodule.quant_config.bits == 4 assert submodule.quant_config.group_size == 128 assert submodule.quant_config.desc_act elif name == 'model.model.layers.1.self_attn.qkv_proj': - # The second layer is quantized using bits=8, group_size=32, + # The second layer is quantized using bits=8, group_size=32 # desc_act=False assert isinstance(submodule, GPTQMarlinLinearMethod) assert submodule.quant_config.bits == 8 @@ -40,7 +40,7 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): assert not submodule.quant_config.desc_act elif (name == 'model.model.layers.2.self_attn.qkv_proj' or name == 'model.model.layers.2.mlp.gate_up_proj'): - # Other layers are not quantized. 
+ # All other layers (layer index >= 2) are not quantized assert isinstance(submodule, UnquantizedLinearMethod) print( From 98a198e0274713e824d301711a661520f15044d3 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 21:08:48 +0800 Subject: [PATCH 29/56] cleanup --- tests/quantization/test_gptq_dynamic_cfg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 9a7ac159dbed..72403c68aa29 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -12,10 +12,10 @@ PROMPT = "On the surface of Mars, we found" -MODELS_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] +MODEL_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] -@pytest.mark.parametrize("model_id", MODELS_QUANT) +@pytest.mark.parametrize("model_id", MODEL_QUANT) def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) From c4a29eb5d50217605b295815230803e046339090 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 21:19:45 +0800 Subject: [PATCH 30/56] use PROMPT variable --- tests/quantization/test_gptq_dynamic_cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 9e1ce642d02d..c00498ab4f33 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -44,6 +44,6 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): assert isinstance(submodule, UnquantizedLinearMethod) print( - vllm_model.generate_greedy(prompts=["Hello my name is"], + vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model From 25703e3eab4e123a4e3737ab8244e99263287257 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 21:22:53 +0800 Subject: [PATCH 31/56] cleanup --- tests/quantization/test_gptq_dynamic_cfg.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index c00498ab4f33..abc48e786bd8 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -43,7 +43,5 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): # All other layers (layer index >= 2) are not quantized assert isinstance(submodule, UnquantizedLinearMethod) - print( - vllm_model.generate_greedy(prompts=[PROMPT], - max_tokens=10)[0][1]) + print(vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model From 070ae3c53985fa4fdd0debcf7d4d24d5aba66961 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Thu, 6 Feb 2025 17:29:14 +0800 Subject: [PATCH 32/56] rename method and add detailed comments --- .../layers/quantization/gptq_marlin.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 51e2d5bfeb38..2044d6d5591a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -58,7 +58,25 @@ def __init__( # (since we have only one group per output channel) desc_act = False + # GPTQModel use `dynamic_cfg` to allow per module quantization config so each module + # can be 
optmized for its own unique quant errors. Format is Dict[str, Dict] where key + # is a regex string that can both positive ("+:" prefixed) or negative ("-:" prefixed) match + # a module. Default to postiive match (override base quant config mode) if no prefix. + # Value is in dict format of field key and override value. Negative matching will skip + # quantization init for this module entirely (non-quantized inference). + # More details and quantize examples can be found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic_cfg = { + # #`.*\.` matches the layers_node prefix + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, # positive match layer 10-15 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # positive match layer 16-21 + # r"-:.*\.moe\..*": {}, # negative match all `moe` layers + #} self.dynamic_cfg = dynamic_cfg + + self.weight_bits = weight_bits self.is_sym = is_sym @@ -73,7 +91,8 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - def update_config(self, prefix: str): + # match dynamic rules with module name (prefix) and apply quantize config overrides if matched + def override_config(self, prefix: str): bits = self.weight_bits b = self.get_dynamic_config(prefix, "bits", bits) @@ -232,7 +251,7 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: # gptqmodel per module/layer dynamic_cfg my override/change base # model quant config - self.quant_config.update_config(prefix=self.prefix) + self.quant_config.override_config(prefix=self.prefix) # Verify supported on platform. verify_marlin_supported(quant_type=self.quant_config.quant_type, From 13b2b7ba4e8250c86f14624b69e7e9d07a0750d1 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 02:54:31 +0000 Subject: [PATCH 33/56] Changed VocabParallelEmbedding.linear_method to quant_method to be consistent with LinearBase. --- tests/quantization/test_lm_head.py | 4 ++-- vllm/lora/layers.py | 2 +- .../model_executor/layers/logits_processor.py | 2 +- .../layers/quantization/gptq_marlin.py | 10 ++++----- .../layers/vocab_parallel_embedding.py | 22 +++++++++---------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ec60d8a57559..2fe3d2f1a867 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -37,11 +37,11 @@ def check_model(model): lm_head_layer = model.lm_head if lm_head_quantized: - assert isinstance(lm_head_layer.linear_method, + assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) else: - assert isinstance(lm_head_layer.linear_method, + assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) vllm_model.apply_model(check_model) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9f0297596ccb..04e5dc8752b6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1040,7 +1040,7 @@ def _get_logits( embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. 
- logits = lm_head.linear_method.apply(lm_head, hidden_states) + logits = lm_head.quant_method.apply(lm_head, hidden_states) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index cdc67ca83d48..a3d5e6da2683 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -95,7 +95,7 @@ def _get_logits( embedding_bias: Optional[torch.Tensor], ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. - logits = lm_head.linear_method.apply(lm_head, + logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 51e2d5bfeb38..97bc3dad001f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, UnquantizedEmbeddingMethod from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -176,13 +176,13 @@ def get_dynamic_config( def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", - UnquantizedLinearMethod]]: - if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) - and self.lm_head_quantized): + UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: + lm_head_quantized = isinstance(layer, ParallelLMHead) and self.lm_head_quantized + if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic_cfg) > 0: result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: - return UnquantizedLinearMethod() + return UnquantizedEmbeddingMethod() if lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index e409094dd535..85081a26c149 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -226,24 +226,24 @@ def __init__(self, self.tp_size) self.embedding_dim = embedding_dim - linear_method = None + quant_method = None if quant_config is not None: - linear_method = quant_config.get_quant_method(self, prefix=prefix) - if linear_method is None: - linear_method = UnquantizedEmbeddingMethod() + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important. 
is_embedding_layer = type(self.__class__) is VocabParallelEmbedding - linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method)) - if is_embedding_layer and not linear_method_implements_embedding: + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method)) + if is_embedding_layer and not quant_method_implements_embedding: raise NotImplementedError( - f"The class {type(linear_method).__name__} must implement " + f"The class {type(quant_method).__name__} must implement " "the 'embedding' method, see UnquantizedEmbeddingMethod.") - self.linear_method: QuantizeMethodBase = linear_method + self.quant_method: QuantizeMethodBase = quant_method if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -260,7 +260,7 @@ def __init__(self, self.shard_indices.added_vocab_end_index - self.shard_indices.added_vocab_start_index) - self.linear_method.create_weights(self, + self.quant_method.create_weights(self, self.embedding_dim, [self.num_embeddings_per_partition], self.embedding_dim, @@ -412,7 +412,7 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = self.linear_method.embedding(self, + output_parallel = self.quant_method.embedding(self, masked_input.long()) # Mask the output embedding. if self.tp_size > 1: From 40562d16b2ea77381428b61ecd10e164572e39f1 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 04:57:36 +0000 Subject: [PATCH 34/56] fix unittest Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic_cfg.py | 37 ++++++++++------- tests/quantization/test_lm_head.py | 14 ++++--- .../layers/quantization/gptq_marlin.py | 40 +++++++++++-------- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index abc48e786bd8..419eaae1c6cd 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether gptq models with dynamic_cfg quantized can be loaded. Run `pytest tests/quantization/test_gptq_dynamic_cfg.py --forked`. @@ -15,7 +16,9 @@ # The first layer is quantized using bits=4, group_size=128 # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized -MODEL_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] +MODEL_QUANT = [ + "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head" +] @pytest.mark.parametrize("model_id", MODEL_QUANT) @@ -24,24 +27,30 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): for name, submodule in (vllm_model.model.llm_engine.model_executor. 
driver_worker.model_runner.model.named_modules()): - if name == 'model.model.layers.0.self_attn.qkv_proj': + if name == "lm_head": + assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + elif name == 'model.layers.0.self_attn.qkv_proj': # The first layer is quantized using bits=4, group_size=128 # desc_act=True - assert isinstance(submodule, GPTQMarlinLinearMethod) - assert submodule.quant_config.bits == 4 - assert submodule.quant_config.group_size == 128 - assert submodule.quant_config.desc_act - elif name == 'model.model.layers.1.self_attn.qkv_proj': + assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': # The second layer is quantized using bits=8, group_size=32 # desc_act=False - assert isinstance(submodule, GPTQMarlinLinearMethod) - assert submodule.quant_config.bits == 8 - assert submodule.quant_config.group_size == 32 - assert not submodule.quant_config.desc_act - elif (name == 'model.model.layers.2.self_attn.qkv_proj' - or name == 'model.model.layers.2.mlp.gate_up_proj'): + assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + config = submodule.quant_method.quant_config + assert config.get_dynamic_config(layer_name=name, key="bits") == 8 + assert config.get_dynamic_config(layer_name=name, + key="group_size") == 32 + assert not config.get_dynamic_config(layer_name=name, + key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized - assert isinstance(submodule, UnquantizedLinearMethod) + assert isinstance(submodule.quant_method, UnquantizedLinearMethod) print(vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 2fe3d2f1a867..246eff2588b9 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -17,10 +17,13 @@ PROMPT = "On the surface of Mars, we found" -MODELS_QUANT = [( - "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse", - True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)] +MODELS_QUANT = [ + ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head", + True), + ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), + ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), + ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) +] @pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) @@ -35,7 +38,8 @@ def test_lm_head( def check_model(model): lm_head_layer = model.lm_head - + print("lm_head_layer.quant_method", model, + lm_head_layer.quant_method) if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 94ad1443fb75..746371814236 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -2,7 +2,6 @@ import re from copy import deepcopy - from typing import Any, Callable, Dict, List, Optional, Set, Union import torch @@ -23,7 +22,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( 
check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, UnquantizedEmbeddingMethod +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, UnquantizedEmbeddingMethod) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -58,25 +58,28 @@ def __init__( # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic_cfg` to allow per module quantization config so each module - # can be optmized for its own unique quant errors. Format is Dict[str, Dict] where key - # is a regex string that can both positive ("+:" prefixed) or negative ("-:" prefixed) match - # a module. Default to postiive match (override base quant config mode) if no prefix. - # Value is in dict format of field key and override value. Negative matching will skip - # quantization init for this module entirely (non-quantized inference). - # More details and quantize examples can be found at: https://github.com/ModelCloud/GPTQModel + # GPTQModel use `dynamic_cfg` to allow per module quantization config + # so each module can be optimized for its own unique quant errors. + # Format is Dict[str, Dict] where key is a regex string that can both + # positive ("+:" prefixed) or negative ("-:" prefixed) match a module. + # Default to positive match (override base quant config mode) if no + # prefix. Value is in dict format of field key and override value. + # Negative matching will skip quantization init for this module entirely + # (non-quantized inference). More details and quantize examples can be + # found at: https://github.com/ModelCloud/GPTQModel # Example: # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 # # last 1/4 of the layers 16-21 has 8bit and group_size 64 # dynamic_cfg = { # #`.*\.` matches the layers_node prefix - # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, # positive match layer 10-15 - # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # positive match layer 16-21 + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # r"-:.*\.moe\..*": {}, # negative match all `moe` layers - #} + # } self.dynamic_cfg = dynamic_cfg - self.weight_bits = weight_bits self.is_sym = is_sym @@ -91,7 +94,8 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # match dynamic rules with module name (prefix) and apply quantize config overrides if matched + # match dynamic rules with module name (prefix) and apply to quantize + # config overrides if matched def override_config(self, prefix: str): bits = self.weight_bits @@ -142,6 +146,8 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic_cfg = cls.get_from_keys_or(config, ["dynamic"], default={}) + if dynamic_cfg is None: + dynamic_cfg = {} weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -196,12 +202,14 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: - lm_head_quantized = isinstance(layer, ParallelLMHead) and self.lm_head_quantized + 
lm_head_quantized = isinstance( + layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic_cfg) > 0: result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: - return UnquantizedEmbeddingMethod() if lm_head_quantized else UnquantizedLinearMethod() + return UnquantizedEmbeddingMethod( + ) if lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 7b774bbbd7044f7e2e2146b8764816f51394309c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 04:59:13 +0000 Subject: [PATCH 35/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_lm_head.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 246eff2588b9..812d3d6825a6 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -38,8 +38,6 @@ def test_lm_head( def check_model(model): lm_head_layer = model.lm_head - print("lm_head_layer.quant_method", model, - lm_head_layer.quant_method) if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, From c72125a3f8f4652f7f8792eee3a7eeb825912cd9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 06:54:08 +0000 Subject: [PATCH 36/56] cleanup Signed-off-by: ZX-ModelCloud --- ...tq_dynamic_cfg.py => test_gptq_dynamic.py} | 16 ++++---- .../layers/quantization/gptq_marlin.py | 40 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) rename tests/quantization/{test_gptq_dynamic_cfg.py => test_gptq_dynamic.py} (78%) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic.py similarity index 78% rename from tests/quantization/test_gptq_dynamic_cfg.py rename to tests/quantization/test_gptq_dynamic.py index 419eaae1c6cd..6c177e1be0f4 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -"""Tests whether gptq models with dynamic_cfg quantized can be loaded. +"""Tests whether gptq models with dynamic quantized can be loaded. -Run `pytest tests/quantization/test_gptq_dynamic_cfg.py --forked`. +Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. """ import pytest @@ -22,7 +22,7 @@ @pytest.mark.parametrize("model_id", MODEL_QUANT) -def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): +def test_gptq_with_dynamic(vllm_runner, model_id: str): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) for name, submodule in (vllm_model.model.llm_engine.model_executor. 
@@ -42,11 +42,11 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): # desc_act=False assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) config = submodule.quant_method.quant_config - assert config.get_dynamic_config(layer_name=name, key="bits") == 8 - assert config.get_dynamic_config(layer_name=name, - key="group_size") == 32 - assert not config.get_dynamic_config(layer_name=name, - key="desc_act") + assert config.get_dynamic_value(layer_name=name, key="bits") == 8 + assert config.get_dynamic_value(layer_name=name, + key="group_size") == 32 + assert not config.get_dynamic_value(layer_name=name, + key="desc_act") elif (name == 'model.layers.2.self_attn.qkv_proj' or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 746371814236..2b24bdbae2f7 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -51,14 +51,14 @@ def __init__( desc_act: bool, is_sym: bool, lm_head_quantized: bool, - dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]], + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic_cfg` to allow per module quantization config + # GPTQModel use `dynamic` to allow per module quantization config # so each module can be optimized for its own unique quant errors. # Format is Dict[str, Dict] where key is a regex string that can both # positive ("+:" prefixed) or negative ("-:" prefixed) match a module. 
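(A side note on the rule format described in the comment above: the matching behaviour can be reproduced in isolation with a few lines of Python. The rule set and layer names below are made up for illustration and are not part of this patch; the snippet only mirrors the lookup this config class performs.)

import re
from typing import Dict, Optional, Union

# Made-up rule set, in the same format as the `dynamic` dict documented above.
dynamic = {
    r"+:.*\.(?:1[0-5])\..*": {"bits": 8},                          # layers 10-15 -> 8-bit
    r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64},  # layers 16-21 -> 8-bit, group 64
    r"-:.*\.moe\..*": {},                                          # skip all moe modules
}

def resolve(layer_name: str, key: Optional[str] = None,
            default: Union[int, bool, None] = None) -> Union[Dict, int, bool, None]:
    for pattern, overrides in dynamic.items():
        if pattern.startswith("-:"):
            # negative match: exclude this module from quantized init
            if re.match(pattern.removeprefix("-:"), layer_name):
                return False
        elif re.match(pattern.removeprefix("+:"), layer_name):
            # positive match: return one overridden field (or the whole override dict)
            return overrides if key is None else overrides.get(key, default)
    return default  # no rule matched: keep the base quantize config value

print(resolve("model.layers.12.self_attn.qkv_proj", "bits", 4))  # 8
print(resolve("model.layers.3.mlp.gate_up_proj", "bits", 4))     # 4 (base config)
print(resolve("model.layers.7.moe.experts.0.w1"))                # False (module skipped)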
@@ -70,7 +70,7 @@ def __init__( # Example: # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 # # last 1/4 of the layers 16-21 has 8bit and group_size 64 - # dynamic_cfg = { + # dynamic = { # #`.*\.` matches the layers_node prefix # # positive match layer 10-15 # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, @@ -78,7 +78,7 @@ def __init__( # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # r"-:.*\.moe\..*": {}, # negative match all `moe` layers # } - self.dynamic_cfg = dynamic_cfg + self.dynamic = dynamic self.weight_bits = weight_bits self.is_sym = is_sym @@ -99,17 +99,17 @@ def __init__( def override_config(self, prefix: str): bits = self.weight_bits - b = self.get_dynamic_config(prefix, "bits", bits) + b = self.get_dynamic_value(prefix, "bits", bits) if isinstance(b, int): bits = b - group_size = self.get_dynamic_config(prefix, "group_size", - self.group_size) + group_size = self.get_dynamic_value(prefix, "group_size", + self.group_size) if isinstance(group_size, int): self.group_size = group_size - desc_act = self.get_dynamic_config(prefix, "desc_act", self.desc_act) + desc_act = self.get_dynamic_value(prefix, "desc_act", self.desc_act) if isinstance(desc_act, bool): self.desc_act = desc_act - is_sym = self.get_dynamic_config(prefix, "sym", self.is_sym) + is_sym = self.get_dynamic_value(prefix, "sym", self.is_sym) if isinstance(is_sym, bool): self.is_sym = is_sym @@ -125,7 +125,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " - f"dynamic_cfg={self.dynamic_cfg}") + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -145,9 +145,9 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": - dynamic_cfg = cls.get_from_keys_or(config, ["dynamic"], default={}) - if dynamic_cfg is None: - dynamic_cfg = {} + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + if dynamic is None: + dynamic = {} weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -155,7 +155,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized, dynamic_cfg) + lm_head_quantized, dynamic) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -178,13 +178,13 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_dynamic_config( + def get_dynamic_value( self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: - for pattern, pattern_dict in self.dynamic_cfg.items(): + for pattern, pattern_dict in self.dynamic.items(): # negative match: matched modules are excluded from quantization if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -205,8 +205,8 @@ def get_quant_method( lm_head_quantized = isinstance( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or lm_head_quantized: - if len(self.dynamic_cfg) > 0: - result = self.get_dynamic_config(layer_name=prefix) + if len(self.dynamic) > 0: + result = self.get_dynamic_value(layer_name=prefix) if result is not None and not result: return UnquantizedEmbeddingMethod( ) if lm_head_quantized else 
UnquantizedLinearMethod() @@ -256,8 +256,8 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.quant_config = deepcopy(quant_config) self.prefix = prefix - if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: - # gptqmodel per module/layer dynamic_cfg my override/change base + if len(self.quant_config.dynamic) > 0 and self.prefix: + # gptqmodel per module/layer dynamic my override/change base # model quant config self.quant_config.override_config(prefix=self.prefix) From c298195414c0f9307557c73120b1209871aac251 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 07:18:59 +0000 Subject: [PATCH 37/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 11 ++++++----- .../layers/quantization/gptq_marlin.py | 14 +++++++------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 6c177e1be0f4..2262a51a5b33 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -42,11 +42,12 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str): # desc_act=False assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) config = submodule.quant_method.quant_config - assert config.get_dynamic_value(layer_name=name, key="bits") == 8 - assert config.get_dynamic_value(layer_name=name, - key="group_size") == 32 - assert not config.get_dynamic_value(layer_name=name, - key="desc_act") + assert config.get_dynamic_override(layer_name=name, + key="bits") == 8 + assert config.get_dynamic_override(layer_name=name, + key="group_size") == 32 + assert not config.get_dynamic_override(layer_name=name, + key="desc_act") elif (name == 'model.layers.2.self_attn.qkv_proj' or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 2b24bdbae2f7..af707d0869ca 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -99,17 +99,17 @@ def __init__( def override_config(self, prefix: str): bits = self.weight_bits - b = self.get_dynamic_value(prefix, "bits", bits) + b = self.get_dynamic_override(prefix, "bits", bits) if isinstance(b, int): bits = b - group_size = self.get_dynamic_value(prefix, "group_size", - self.group_size) + group_size = self.get_dynamic_override(prefix, "group_size", + self.group_size) if isinstance(group_size, int): self.group_size = group_size - desc_act = self.get_dynamic_value(prefix, "desc_act", self.desc_act) + desc_act = self.get_dynamic_override(prefix, "desc_act", self.desc_act) if isinstance(desc_act, bool): self.desc_act = desc_act - is_sym = self.get_dynamic_value(prefix, "sym", self.is_sym) + is_sym = self.get_dynamic_override(prefix, "sym", self.is_sym) if isinstance(is_sym, bool): self.is_sym = is_sym @@ -178,7 +178,7 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_dynamic_value( + def get_dynamic_override( self, layer_name: str, key: Optional[str] = None, @@ -206,7 +206,7 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic) > 0: - result = self.get_dynamic_value(layer_name=prefix) + result = self.get_dynamic_override(layer_name=prefix) if result is not None and not result: return 
UnquantizedEmbeddingMethod( ) if lm_head_quantized else UnquantizedLinearMethod() From bbc049dbce6160ccdff1e3ebe3f90ad3f415e3a9 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 15:38:42 +0800 Subject: [PATCH 38/56] Update gptq_marlin.py --- .../layers/quantization/gptq_marlin.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index af707d0869ca..93e46fd63e5a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -58,14 +58,14 @@ def __init__( # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic` to allow per module quantization config - # so each module can be optimized for its own unique quant errors. - # Format is Dict[str, Dict] where key is a regex string that can both - # positive ("+:" prefixed) or negative ("-:" prefixed) match a module. - # Default to positive match (override base quant config mode) if no - # prefix. Value is in dict format of field key and override value. - # Negative matching will skip quantization init for this module entirely - # (non-quantized inference). More details and quantize examples can be + # GPTQModel use `dynamic` config property to allow per module quantization + # config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can perform both + # positive ("+:" prefixed) or negative ("-:" prefixed) matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override value. + # Negative matching will skip quantization init for this module entirely: + # non-quantized inference. More details and quantization examples can be # found at: https://github.com/ModelCloud/GPTQModel # Example: # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 @@ -76,7 +76,7 @@ def __init__( # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, # # positive match layer 16-21 # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, - # r"-:.*\.moe\..*": {}, # negative match all `moe` layers + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers # } self.dynamic = dynamic @@ -94,8 +94,8 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # match dynamic rules with module name (prefix) and apply to quantize - # config overrides if matched + # Match dynamic rules with module name (prefix) and override quantize + # config if module (prefix) matches a rule def override_config(self, prefix: str): bits = self.weight_bits @@ -185,11 +185,11 @@ def get_dynamic_override( default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in self.dynamic.items(): - # negative match: matched modules are excluded from quantization + # Negative match: matched modules are excluded from quantized init if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False - # positive match: matched modules have quant properties overriding + # Positive match: matched modules have quant properties overrides # base quant config elif re.match(pattern.removeprefix("+:"), layer_name): if key is None: @@ -218,7 +218,6 @@ def get_quant_method( @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): - # Extract data from quant config. 
quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") group_size = quant_config.get("group_size") @@ -231,7 +230,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): if quant_method != "gptq": return False - # If we cannot find the info needed in the config, cannot convert. + # Marlin conversion is only valid if required properties are found if (num_bits is None or group_size is None or sym is None or desc_act is None): return False @@ -257,8 +256,7 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.prefix = prefix if len(self.quant_config.dynamic) > 0 and self.prefix: - # gptqmodel per module/layer dynamic my override/change base - # model quant config + # Dynamic per module/layer rules may override base config self.quant_config.override_config(prefix=self.prefix) # Verify supported on platform. From 78f88183345dcc05e5078def6250de7018642f22 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 07:41:43 +0000 Subject: [PATCH 39/56] format Signed-off-by: ZX-ModelCloud --- .../layers/quantization/gptq_marlin.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 93e46fd63e5a..61c8d9011f0d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -58,13 +58,16 @@ def __init__( # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic` config property to allow per module quantization - # config so each module can be individually optimized. - # Format is Dict[str, Dict] where key is a regex string that can perform both - # positive ("+:" prefixed) or negative ("-:" prefixed) matching of a module. + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. # Default to positive match, override base quant config mode, if no - # prefix is used. Value is in dict format of field key and override value. - # Negative matching will skip quantization init for this module entirely: + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: # non-quantized inference. 
More details and quantization examples can be # found at: https://github.com/ModelCloud/GPTQModel # Example: From 93ee5762186ee1f3e92fd250e275d168a87ca4ff Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:23:07 +0800 Subject: [PATCH 40/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 3a616d6f307f..36d4635db956 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -210,7 +210,8 @@ def get_quant_method( if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic) > 0: result = self.get_dynamic_override(layer_name=prefix) - if result is not None and not result: + # False = skip module, None = no override, else = Positive match + if result == False: return UnquantizedEmbeddingMethod( ) if lm_head_quantized else UnquantizedLinearMethod() From 6ebf85cbfc56b2f8fe5110b4de00818f1a71f47b Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:36:04 +0800 Subject: [PATCH 41/56] rename to parallel_lm_head_quantized for clarity --- vllm/model_executor/layers/quantization/gptq_marlin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 36d4635db956..1b5003c1aed8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -205,15 +205,15 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: - lm_head_quantized = isinstance( + parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and self.lm_head_quantized - if isinstance(layer, LinearBase) or lm_head_quantized: + if isinstance(layer, LinearBase) or parallel_lm_head_quantized: if len(self.dynamic) > 0: result = self.get_dynamic_override(layer_name=prefix) # False = skip module, None = no override, else = Positive match if result == False: return UnquantizedEmbeddingMethod( - ) if lm_head_quantized else UnquantizedLinearMethod() + ) if parallel_lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 59bdf54c2ee2c314fb28049fcccc7877fc602c0f Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:38:31 +0800 Subject: [PATCH 42/56] simplify --- vllm/model_executor/layers/quantization/gptq_marlin.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 1b5003c1aed8..11e65e6ced2b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,12 +208,10 @@ def get_quant_method( parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: - if len(self.dynamic) > 0: - result = self.get_dynamic_override(layer_name=prefix) - # False = skip module, None = no override, else = Positive match - if result == False: - return UnquantizedEmbeddingMethod( - ) if 
parallel_lm_head_quantized else UnquantizedLinearMethod() + # False = skip module, None = no override, else = Positive match + if self.get_dynamic_override(layer_name=prefix) == False: + return UnquantizedEmbeddingMethod( + ) if parallel_lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 9de0382159e6e18a59a2b243f71b08a9224b4870 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:42:48 +0800 Subject: [PATCH 43/56] shorten code --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 11e65e6ced2b..22bd9c0a63b8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -149,8 +149,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) - if dynamic is None: - dynamic = {} + dynamic = {} if dynamic is None else dynamic weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) From 67d0882ff4522a2e3100d7a76ec052e095a97adb Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 16:09:31 +0000 Subject: [PATCH 44/56] cleanup Signed-off-by: ZX-ModelCloud --- .../layers/quantization/gptq_marlin.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 22bd9c0a63b8..d3a7253c7686 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,11 +208,17 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override(layer_name=prefix) == False: + if self.get_dynamic_override(layer_name=prefix) == False: return UnquantizedEmbeddingMethod( ) if parallel_lm_head_quantized else UnquantizedLinearMethod() - return GPTQMarlinLinearMethod(self, prefix=prefix) + quant_config = deepcopy(self) + + if len(quant_config.dynamic) > 0 and prefix: + # Dynamic per module/layer rules may override base config + quant_config.override_config(prefix=prefix) + + return GPTQMarlinLinearMethod(quant_config) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) return None @@ -252,13 +258,8 @@ class GPTQMarlinLinearMethod(LinearMethodBase): _kernel_backends_being_used: Set[str] = set() - def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: - self.quant_config = deepcopy(quant_config) - self.prefix = prefix - - if len(self.quant_config.dynamic) > 0 and self.prefix: - # Dynamic per module/layer rules may override base config - self.quant_config.override_config(prefix=self.prefix) + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config # Verify supported on platform. 
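The explicit comparison against False in these hunks is deliberate: the dynamic lookup can return False, None, or a (possibly empty) override dict, and only the first of those means "skip quantization". A tiny self-contained illustration of that tri-state dispatch; the return strings below are placeholders, not vLLM classes:

from typing import Dict, Optional, Union

def choose_method(override: Union[Dict, bool, None]) -> Optional[str]:
    # Tri-state contract from the patch:
    #   False              -> negative match: leave this module unquantized
    #   None               -> no rule matched: fall back to the base config
    #   dict (maybe empty) -> positive match: quantize, applying the overrides
    if override is False:
        return None                   # stand-in for UnquantizedLinearMethod
    return "quantized"                # stand-in for GPTQMarlinLinearMethod

assert choose_method(False) is None        # skipped module
assert choose_method(None) == "quantized"  # base config applies unchanged
assert choose_method({}) == "quantized"    # positive match with no field overrides
# A plain truthiness test (`if not override`) would wrongly treat None and {}
# the same as False, which is why the patch compares against False explicitly.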
verify_marlin_supported(quant_type=self.quant_config.quant_type, From 5623936795422e82254b01e1f799eec4cc7a9300 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 16:13:22 +0000 Subject: [PATCH 45/56] cleanup Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d3a7253c7686..6f1be14c9be6 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -214,7 +214,7 @@ def get_quant_method( quant_config = deepcopy(self) - if len(quant_config.dynamic) > 0 and prefix: + if prefix: # Dynamic per module/layer rules may override base config quant_config.override_config(prefix=prefix) From e41bdd760b9aa5948c20fa69c5bbd9a2a03c69ae Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sat, 8 Feb 2025 00:14:54 +0800 Subject: [PATCH 46/56] make lint pass --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 6f1be14c9be6..d910587eb3b5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,7 +208,7 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override(layer_name=prefix) == False: + if self.get_dynamic_override(layer_name=prefix) == False: # noqa: E712 return UnquantizedEmbeddingMethod( ) if parallel_lm_head_quantized else UnquantizedLinearMethod() From 965d7daa1ba320b3389f57eea10ab7d68e82eec9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 02:26:01 +0000 Subject: [PATCH 47/56] change model_id Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 2 +- tests/quantization/test_lm_head.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 2262a51a5b33..37db57dc8d98 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -17,7 +17,7 @@ # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head" + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head" ] diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 812d3d6825a6..7343e56099ad 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -18,7 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ - ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head", + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), From 1a34027aedb408329f85a3d3f6c3d7b375c179de Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:07:13 +0000 Subject: [PATCH 48/56] format Signed-off-by: ZX-ModelCloud --- 
tests/quantization/test_lm_head.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 7343e56099ad..3bbf1f0e4765 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -18,8 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ - ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", - True), + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) From 0b249a1f9e37c07e3b4ac9cc84a0a1383abb3dd3 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:19:53 +0000 Subject: [PATCH 49/56] format code Signed-off-by: ZX-ModelCloud --- .../layers/vocab_parallel_embedding.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 85081a26c149..f65dfc3cb329 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -261,12 +261,12 @@ def __init__(self, self.shard_indices.added_vocab_start_index) self.quant_method.create_weights(self, - self.embedding_dim, - [self.num_embeddings_per_partition], - self.embedding_dim, - self.num_embeddings_padded, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader) @classmethod def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, @@ -413,7 +413,7 @@ def forward(self, input_): masked_input = input_ # Get the embeddings. output_parallel = self.quant_method.embedding(self, - masked_input.long()) + masked_input.long()) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) From 4de04ae40bd9473d7039d365f8f55533cdaeff10 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:39:53 +0000 Subject: [PATCH 50/56] format code Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/logits_processor.py | 4 ++-- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 7c907242de3e..9b1742998578 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -109,8 +109,8 @@ def _get_logits( ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. 
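The call sites being re-indented in these commits illustrate the two-phase contract every quant method follows: parameters are created once when the layer is built, then applied during the forward pass. A toy stand-in for that contract using plain torch; the class below is illustrative, not a real vLLM quant method:

from typing import Optional
import torch

class ToyUnquantizedMethod:
    """Minimal stand-in for the create_weights/apply contract."""

    def create_weights(self, layer: torch.nn.Module, in_features: int,
                       out_features: int, params_dtype: torch.dtype) -> None:
        weight = torch.nn.Parameter(
            torch.empty(out_features, in_features, dtype=params_dtype),
            requires_grad=False)
        layer.register_parameter("weight", weight)

    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        return torch.nn.functional.linear(x, layer.weight, bias)

lm_head = torch.nn.Module()
method = ToyUnquantizedMethod()
method.create_weights(lm_head, in_features=16, out_features=32,
                      params_dtype=torch.float32)
torch.nn.init.normal_(lm_head.weight)
hidden_states = torch.randn(4, 16)
print(method.apply(lm_head, hidden_states).shape)  # torch.Size([4, 32])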
logits = lm_head.quant_method.apply(lm_head, - hidden_states, - bias=embedding_bias) + hidden_states, + bias=embedding_bias) # Gather logits for TP logits = self._gather_logits(logits) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d910587eb3b5..649d09c62877 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,7 +208,8 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override(layer_name=prefix) == False: # noqa: E712 + if self.get_dynamic_override( + layer_name=prefix) == False: # noqa: E712 return UnquantizedEmbeddingMethod( ) if parallel_lm_head_quantized else UnquantizedLinearMethod() From 4c0608b3d9ec61df6d1e8f5df97d58ec08cbc75f Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:42:33 +0000 Subject: [PATCH 51/56] format code Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/gptq_marlin.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 649d09c62877..76b5e81cfa6b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,10 +208,11 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override( - layer_name=prefix) == False: # noqa: E712 - return UnquantizedEmbeddingMethod( - ) if parallel_lm_head_quantized else UnquantizedLinearMethod() + if self.get_dynamic_override( + layer_name=prefix) == False: # noqa: E712 + if parallel_lm_head_quantized: + return UnquantizedEmbeddingMethod() + return UnquantizedLinearMethod() quant_config = deepcopy(self) From 8f2137547ef1cb4abf1524e21b05000670ec9e6c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:47:50 +0000 Subject: [PATCH 52/56] disable E712 ruff check Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 76b5e81cfa6b..e0060f74a9ac 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,7 +208,7 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override( + if self.get_dynamic_override( # noqa: E712 layer_name=prefix) == False: # noqa: E712 if parallel_lm_head_quantized: return UnquantizedEmbeddingMethod() From e3084e3586086b069661ad431296540bf9c67204 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 08:28:04 +0000 Subject: [PATCH 53/56] Extract code to gptq_utils.get_linear_quant_method() Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 30 +++--- .../layers/quantization/gptq.py | 47 +++++++-- .../layers/quantization/gptq_marlin.py | 77 
++------------- .../layers/quantization/utils/gptq_utils.py | 98 +++++++++++++++++++ 4 files changed, 163 insertions(+), 89 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/gptq_utils.py diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 37db57dc8d98..88882a5bdbdd 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -8,8 +8,11 @@ import torch from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_dynamic_override) PROMPT = "On the surface of Mars, we found" @@ -17,7 +20,8 @@ # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head" + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", ] @@ -25,14 +29,18 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + use_marlin_kernel = "symTrue" in model_id + linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( + GPTQLinearMethod) + for name, submodule in (vllm_model.model.llm_engine.model_executor. driver_worker.model_runner.model.named_modules()): if name == "lm_head": - assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + assert isinstance(submodule.quant_method, linear_method_cls) elif name == 'model.layers.0.self_attn.qkv_proj': # The first layer is quantized using bits=4, group_size=128 # desc_act=True - assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + assert isinstance(submodule.quant_method, linear_method_cls) config = submodule.quant_method.quant_config assert config.weight_bits == 4 assert config.group_size == 128 @@ -40,18 +48,18 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str): elif name == 'model.layers.1.self_attn.qkv_proj': # The second layer is quantized using bits=8, group_size=32 # desc_act=False - assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + assert isinstance(submodule.quant_method, linear_method_cls) config = submodule.quant_method.quant_config - assert config.get_dynamic_override(layer_name=name, - key="bits") == 8 - assert config.get_dynamic_override(layer_name=name, - key="group_size") == 32 - assert not config.get_dynamic_override(layer_name=name, - key="desc_act") + assert get_dynamic_override(config, layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") elif (name == 'model.layers.2.self_attn.qkv_proj' or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized assert isinstance(submodule.quant_method, UnquantizedLinearMethod) - print(vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 0cb77a7546d1..6d1f0cc2eb4d 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ 
b/vllm/model_executor/layers/quantization/gptq.py @@ -3,16 +3,17 @@ import enum from enum import Enum from fractions import Fraction -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -32,7 +33,33 @@ def __init__( group_size: int, desc_act: bool, lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + self.dynamic = dynamic + self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act @@ -47,7 +74,8 @@ def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act})," - f"lm_head_quantized={self.lm_head_quantized}") + f"lm_head_quantized={self.lm_head_quantized}), " + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -68,19 +96,20 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic = {} if dynamic is None else dynamic + weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - return cls(weight_bits, group_size, desc_act, lm_head_quantized) + return cls(weight_bits, group_size, desc_act, lm_head_quantized, + dynamic) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["GPTQLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return GPTQLinearMethod(self) - return None + return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) class ExllamaState(Enum): diff --git 
a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index e0060f74a9ac..0a9d86b008db 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import re -from copy import deepcopy from typing import Any, Callable, Dict, List, Optional, Set, Union import torch @@ -11,7 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, +from vllm.model_executor.layers.linear import (LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( @@ -19,11 +17,13 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( MPLinearLayerConfig, choose_mp_linear_kernel) from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, UnquantizedEmbeddingMethod) + UnquantizedEmbeddingMethod) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -97,32 +97,6 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # Match dynamic rules with module name (prefix) and override quantize - # config if module (prefix) matches a rule - def override_config(self, prefix: str): - bits = self.weight_bits - - b = self.get_dynamic_override(prefix, "bits", bits) - if isinstance(b, int): - bits = b - group_size = self.get_dynamic_override(prefix, "group_size", - self.group_size) - if isinstance(group_size, int): - self.group_size = group_size - desc_act = self.get_dynamic_override(prefix, "desc_act", self.desc_act) - if isinstance(desc_act, bool): - self.desc_act = desc_act - is_sym = self.get_dynamic_override(prefix, "sym", self.is_sym) - if isinstance(is_sym, bool): - self.is_sym = is_sym - - self.pack_factor = 32 // bits # packed into int32 - if (bits, self.is_sym) not in self.TYPE_MAP: - raise ValueError("Unsupported quantization config: " - f"bits={bits}, sym={self.is_sym}") - - self.quant_type = self.TYPE_MAP[(bits, self.is_sym)] - def __repr__(self) -> str: return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " f"group_size={self.group_size}, " @@ -150,6 +124,7 @@ def get_config_filenames(cls) -> List[str]: def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) dynamic = {} if dynamic is None else dynamic + weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -180,50 +155,14 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_dynamic_override( - self, - layer_name: str, - key: Optional[str] = None, - default_value: Union[int, bool, None] = None - ) -> Union[Dict, int, bool, None]: - for pattern, pattern_dict in self.dynamic.items(): - # Negative match: matched modules are excluded from 
quantized init - if pattern.startswith("-:"): - if re.match(pattern.removeprefix("-:"), layer_name): - return False - # Positive match: matched modules have quant properties overrides - # base quant config - elif re.match(pattern.removeprefix("+:"), layer_name): - if key is None: - return pattern_dict - else: - return pattern_dict.get(key, default_value) - return default_value - def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: - parallel_lm_head_quantized = isinstance( - layer, ParallelLMHead) and self.lm_head_quantized - if isinstance(layer, LinearBase) or parallel_lm_head_quantized: - # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override( # noqa: E712 - layer_name=prefix) == False: # noqa: E712 - if parallel_lm_head_quantized: - return UnquantizedEmbeddingMethod() - return UnquantizedLinearMethod() - - quant_config = deepcopy(self) - - if prefix: - # Dynamic per module/layer rules may override base config - quant_config.override_config(prefix=prefix) - - return GPTQMarlinLinearMethod(quant_config) - elif isinstance(layer, FusedMoE): + if isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) - return None + return get_linear_quant_method(self, layer, prefix, + GPTQMarlinLinearMethod) @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py new file mode 100644 index 000000000000..7552f9d1705a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +import re +from copy import deepcopy +from fractions import Fraction +from typing import Dict, Optional, Union + +import torch + +from vllm.config import QuantizationConfig +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, UnquantizedEmbeddingMethod) + + +# Match dynamic rules with module name (prefix) and override quantize +# config if module (prefix) matches a rule +def override_config(config: QuantizationConfig, prefix: str): + weight_bits = get_dynamic_override(config, prefix, "bits", + config.weight_bits) + if isinstance(weight_bits, int): + config.weight_bits = weight_bits + group_size = get_dynamic_override(config, prefix, "group_size", + config.group_size) + if isinstance(group_size, int): + config.group_size = group_size + desc_act = get_dynamic_override(config, prefix, "desc_act", + config.desc_act) + if isinstance(desc_act, bool): + config.desc_act = desc_act + + if config.get_name() == "gptq_marlin": + is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym) + if isinstance(is_sym, bool): + config.is_sym = is_sym + + config.pack_factor = 32 // config.weight_bits # packed into int32 + if (config.weight_bits, config.is_sym) not in config.TYPE_MAP: + raise ValueError("Unsupported quantization config: " + f"bits={config.weight_bits}, sym={config.is_sym}") + + config.quant_type = config.TYPE_MAP[(config.weight_bits, + config.is_sym)] + elif config.get_name() == "gptq": + config.pack_factor = Fraction(32, config.weight_bits) + if config.weight_bits not in [2, 3, 4, 8]: + raise ValueError( + "Currently, 
only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {config.weight_bits} bits.") + + +def get_dynamic_override( + config: QuantizationConfig, + layer_name: str, + key: Optional[str] = None, + default_value: Union[int, bool, + None] = None) -> Union[Dict, int, bool, None]: + for pattern, pattern_dict in config.dynamic.items(): + # Negative match: matched modules are excluded from quantized init + if pattern.startswith("-:"): + if re.match(pattern.removeprefix("-:"), layer_name): + return False + # Positive match: matched modules have quant properties overrides + # base quant config + elif re.match(pattern.removeprefix("+:"), layer_name): + if key is None: + return pattern_dict + else: + return pattern_dict.get(key, default_value) + return default_value + + +def get_linear_quant_method( + config: QuantizationConfig, + layer: torch.nn.Module, + prefix: str, + linear_method_cls: type, +) -> Optional[QuantizeMethodBase]: + cloned_config = deepcopy(config) + parallel_lm_head_quantized = isinstance( + layer, ParallelLMHead) and cloned_config.lm_head_quantized + if isinstance(layer, LinearBase) or parallel_lm_head_quantized: + # False = skip module, None = no override, else = Positive match + if get_dynamic_override( # noqa: E712 + cloned_config, # noqa: E712 + layer_name=prefix) == False: # noqa: E712 + if parallel_lm_head_quantized: + return UnquantizedEmbeddingMethod() + return UnquantizedLinearMethod() + + if prefix: + # Dynamic per module/layer rules may override base config + override_config(cloned_config, prefix=prefix) + + return linear_method_cls(cloned_config) + return None From 25dbd5a5aaeebc7162ec61b8a6a8f95e5911e69e Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 08:46:46 +0000 Subject: [PATCH 54/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 15 ++++++++++----- .../layers/quantization/utils/gptq_utils.py | 13 +++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 88882a5bdbdd..71ad0835c8a3 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -3,6 +3,7 @@ Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. 
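The helper extracted into gptq_utils above deep-copies the config before applying any per-prefix override, so a rule that changes one layer never bleeds into sibling layers or the shared base config. A condensed stand-in of that flow; the dataclass below is illustrative, not the real GPTQConfig/GPTQMarlinConfig, and negative "-:" rules are omitted for brevity:

import re
from copy import deepcopy
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class ToyQuantConfig:
    weight_bits: int = 4
    group_size: int = 128
    # regex -> field overrides, in the spirit of the patch's `dynamic` property
    dynamic: Dict[str, Dict[str, int]] = field(default_factory=dict)

def config_for_layer(base: ToyQuantConfig, prefix: str) -> ToyQuantConfig:
    cfg = deepcopy(base)              # each layer works on its own copy
    for pattern, overrides in base.dynamic.items():
        if re.match(pattern.removeprefix("+:"), prefix):
            for key, value in overrides.items():
                setattr(cfg, key, value)
            break
    return cfg                        # the real code hands this to linear_method_cls

base = ToyQuantConfig(
    dynamic={r"+:.*\.layers\.1\..*": {"weight_bits": 8, "group_size": 32}})
a = config_for_layer(base, "model.layers.0.self_attn.qkv_proj")
b = config_for_layer(base, "model.layers.1.self_attn.qkv_proj")
print(a.weight_bits, a.group_size)  # 4 128 (no rule matched; copy of the base)
print(b.weight_bits, b.group_size)  # 8 32  (override applied to this layer only)
print(base.weight_bits)             # 4     (shared base config never mutated)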
""" +from typing import Tuple import pytest import torch @@ -20,16 +21,20 @@ # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + True), + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + False), ] -@pytest.mark.parametrize("model_id", MODEL_QUANT) -def test_gptq_with_dynamic(vllm_runner, model_id: str): +@pytest.mark.parametrize("model_id_and_use_marlin_kernel", MODEL_QUANT) +def test_gptq_with_dynamic(vllm_runner, + model_id_and_use_marlin_kernel: Tuple[str, bool]): + model_id, use_marlin_kernel = model_id_and_use_marlin_kernel + vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) - use_marlin_kernel = "symTrue" in model_id linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 7552f9d1705a..1e85316f0110 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -2,18 +2,21 @@ import re from copy import deepcopy from fractions import Fraction -from typing import Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, Optional, Union import torch from vllm.config import QuantizationConfig from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizeMethodBase) from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, UnquantizedEmbeddingMethod) +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod + from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod, GPTQMarlinMoEMethod) + # Match dynamic rules with module name (prefix) and override quantize # config if module (prefix) matches a rule @@ -77,7 +80,9 @@ def get_linear_quant_method( layer: torch.nn.Module, prefix: str, linear_method_cls: type, -) -> Optional[QuantizeMethodBase]: +) -> Optional[Union["GPTQLinearMethod", "GPTQMarlinLinearMethod", + "GPTQMarlinMoEMethod", UnquantizedLinearMethod, + UnquantizedEmbeddingMethod]]: cloned_config = deepcopy(config) parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and cloned_config.lm_head_quantized From 874076c52423a87dd71f6252811df0e129324636 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 09:07:01 +0000 Subject: [PATCH 55/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 8 +++----- tests/quantization/test_lm_head.py | 10 ++++------ .../layers/quantization/utils/gptq_utils.py | 11 ++--------- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 71ad0835c8a3..c6f34fef2743 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -3,7 +3,6 @@ Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. 
""" -from typing import Tuple import pytest import torch @@ -28,10 +27,9 @@ ] -@pytest.mark.parametrize("model_id_and_use_marlin_kernel", MODEL_QUANT) -def test_gptq_with_dynamic(vllm_runner, - model_id_and_use_marlin_kernel: Tuple[str, bool]): - model_id, use_marlin_kernel = model_id_and_use_marlin_kernel +@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) +def test_gptq_with_dynamic(vllm_runner, model_id: str, + use_marlin_kernel: bool): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 3bbf1f0e4765..20435a287e37 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -3,7 +3,6 @@ Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. """ -from typing import Tuple import pytest import torch @@ -25,14 +24,13 @@ ] -@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) +@pytest.mark.parametrize("model_id, lm_head_quantized", MODELS_QUANT) def test_lm_head( vllm_runner, - model_lm_head_quant: Tuple[str, bool], + model_id: str, + lm_head_quantized: bool, ) -> None: - model, lm_head_quantized = model_lm_head_quant - - with vllm_runner(model, dtype=torch.float16, + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: def check_model(model): diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 1e85316f0110..fe2a7c959f6c 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -2,7 +2,7 @@ import re from copy import deepcopy from fractions import Fraction -from typing import TYPE_CHECKING, Dict, Optional, Union +from typing import Dict, Optional, Union import torch @@ -12,11 +12,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, UnquantizedEmbeddingMethod) -if TYPE_CHECKING: - from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod - from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinLinearMethod, GPTQMarlinMoEMethod) - # Match dynamic rules with module name (prefix) and override quantize # config if module (prefix) matches a rule @@ -80,9 +75,7 @@ def get_linear_quant_method( layer: torch.nn.Module, prefix: str, linear_method_cls: type, -) -> Optional[Union["GPTQLinearMethod", "GPTQMarlinLinearMethod", - "GPTQMarlinMoEMethod", UnquantizedLinearMethod, - UnquantizedEmbeddingMethod]]: +): cloned_config = deepcopy(config) parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and cloned_config.lm_head_quantized From c7f10be8548a3e31f7d472b8f151164d4a2ed3a9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Wed, 12 Feb 2025 03:04:38 +0000 Subject: [PATCH 56/56] do not use Fraction Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/utils/gptq_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index fe2a7c959f6c..5b0e6299f473 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import re from copy import deepcopy -from fractions import Fraction from typing import Dict, Optional, Union import torch @@ -29,12 +28,12 @@ 
def override_config(config: QuantizationConfig, prefix: str): if isinstance(desc_act, bool): config.desc_act = desc_act + config.pack_factor = 32 // config.weight_bits # packed into int32 if config.get_name() == "gptq_marlin": is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym) if isinstance(is_sym, bool): config.is_sym = is_sym - config.pack_factor = 32 // config.weight_bits # packed into int32 if (config.weight_bits, config.is_sym) not in config.TYPE_MAP: raise ValueError("Unsupported quantization config: " f"bits={config.weight_bits}, sym={config.is_sym}") @@ -42,7 +41,6 @@ def override_config(config: QuantizationConfig, prefix: str): config.quant_type = config.TYPE_MAP[(config.weight_bits, config.is_sym)] elif config.get_name() == "gptq": - config.pack_factor = Fraction(32, config.weight_bits) if config.weight_bits not in [2, 3, 4, 8]: raise ValueError( "Currently, only 2/3/4/8-bit weight quantization is "
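For reference, the pack-factor arithmetic that `override_config` now recomputes with plain integer division: GPTQ packs quantized weights into int32 words, so the factor is simply how many values of the overridden bit-width fit into 32 bits. A quick worked check:

for bits in (2, 4, 8):
    print(f"{bits}-bit -> {32 // bits} weights per packed int32")
# 2-bit -> 16 weights per packed int32
# 4-bit -> 8 weights per packed int32
# 8-bit -> 4 weights per packed int32

Note that 3-bit is the one GPTQ width where this division is not exact (32/3), which is presumably why the unmodified GPTQ config path keeps the `fractions` import seen earlier in gptq.py.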