From f470b268afbbc7d14458c5df701a2e0aaea7da58 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 1 Aug 2024 14:20:39 +0000 Subject: [PATCH 01/56] gptq_marlin compat dynamic_bits quantize config --- .../layers/quantization/gptq_marlin.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index bdcc9c3b4f0c..4568c8364f90 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,3 +1,4 @@ +import re from typing import Any, Dict, List, Optional import torch @@ -23,29 +24,45 @@ class GPTQMarlinConfig(QuantizationConfig): """Config class for GPTQ Marlin""" def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool) -> None: + is_sym: bool, lm_head_quantized: bool, dynamic_bits: Dict[str, int]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - self.weight_bits = weight_bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + self.dynamic_bits = dynamic_bits + self._weight_bits = weight_bits + self._pack_factor = 32 // self._weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act self.is_sym = is_sym self.lm_head_quantized = lm_head_quantized # Verify supported on platform. - verify_gptq_marlin_supported(num_bits=self.weight_bits, + verify_gptq_marlin_supported(num_bits=self._weight_bits, group_size=self.group_size, is_sym=self.is_sym) + def get_weight_bits(self, prefix: str): + real_bits = self._weight_bits + if len(self.dynamic_bits) > 0 and prefix: + remove_prefix = r'^.*?(?=\d)' + match_name = re.sub(remove_prefix, '', prefix) + for pattern, dm_bits in self.dynamic_bits.items(): + if re.match(pattern, match_name): + real_bits = dm_bits + break + return real_bits + + def get_pack_factor(self, prefix: str): + return 32 // self.get_weight_bits(prefix) # packed into int32 + def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " + return (f"GPTQMarlinConfig(weight_bits={self._weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " - f"lm_head_quantized={self.lm_head_quantized})") + f"lm_head_quantized={self.lm_head_quantized}), " + f"dynamic_bits={self.dynamic_bits}") @classmethod def get_name(cls) -> str: @@ -65,6 +82,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": + dynamic_bits = cls.get_from_keys_or(config, ["dynamic_bits"], default={}) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -72,7 +90,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized) + lm_head_quantized, dynamic_bits) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -150,6 +168,7 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size + self.prefix = extra_weight_attrs.get("prefix", "") output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition @@ -178,11 +197,10 @@ def 
create_weights( # shard the scales in TP>1 case. scales_and_zp_input_dim = 0 scales_and_zp_size = input_size_per_partition // group_size - # Quantized weights qweight = Parameter( torch.empty( - input_size_per_partition // self.quant_config.pack_factor, + input_size_per_partition // self.quant_config.get_pack_factor(self.prefix), output_size_per_partition, dtype=torch.int32, ), @@ -195,7 +213,7 @@ def create_weights( "input_dim": 0, "output_dim": 1, "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, + "pack_factor": self.quant_config.get_pack_factor(self.prefix), }, ) @@ -238,7 +256,7 @@ def create_weights( qzeros = Parameter( torch.empty( scales_and_zp_size, - output_size_per_partition // self.quant_config.pack_factor, + output_size_per_partition // self.quant_config.get_pack_factor(self.prefix), dtype=torch.int32, device="meta", ), @@ -251,7 +269,7 @@ def create_weights( "input_dim": scales_and_zp_input_dim, "output_dim": 1, "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, + "pack_factor": self.quant_config.get_pack_factor(self.prefix), }, ) @@ -293,7 +311,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.weight_bits) + num_bits=self.quant_config.get_weight_bits(self.prefix)) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -319,7 +337,7 @@ def apply( g_idx=layer.g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, - num_bits=self.quant_config.weight_bits, + num_bits=self.quant_config.get_weight_bits(self.prefix), output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, From 502edb36e8ed45828bf5402c398ef34f4c8ab3dc Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 2 Aug 2024 10:41:03 +0800 Subject: [PATCH 02/56] Update gptq_marlin.py --- .../layers/quantization/gptq_marlin.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4568c8364f90..cbcead0c7c20 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -44,15 +44,14 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym=self.is_sym) def get_weight_bits(self, prefix: str): - real_bits = self._weight_bits + bits = self._weight_bits + # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: - remove_prefix = r'^.*?(?=\d)' - match_name = re.sub(remove_prefix, '', prefix) - for pattern, dm_bits in self.dynamic_bits.items(): - if re.match(pattern, match_name): - real_bits = dm_bits + for pattern, dym_bits in self.dynamic_bits.items(): + if re.match(pattern, prefix): + bits = dym_bits break - return real_bits + return bits def get_pack_factor(self, prefix: str): return 32 // self.get_weight_bits(prefix) # packed into int32 From 18064cd0400a41375b9cdfb832e4ce3169be713d Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:23:30 +0000 Subject: [PATCH 03/56] cleanup --- .../layers/quantization/gptq_marlin.py | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py 
b/vllm/model_executor/layers/quantization/gptq_marlin.py index cbcead0c7c20..91f9053c8401 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,4 +1,5 @@ import re +from copy import deepcopy from typing import Any, Dict, List, Optional import torch @@ -31,33 +32,34 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, desc_act = False self.dynamic_bits = dynamic_bits - self._weight_bits = weight_bits - self._pack_factor = 32 // self._weight_bits # packed into int32 + self.weight_bits = weight_bits + self.pack_factor = 32 // self.weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act self.is_sym = is_sym self.lm_head_quantized = lm_head_quantized # Verify supported on platform. - verify_gptq_marlin_supported(num_bits=self._weight_bits, + verify_gptq_marlin_supported(num_bits=self.weight_bits, group_size=self.group_size, is_sym=self.is_sym) - def get_weight_bits(self, prefix: str): - bits = self._weight_bits + def update_bits_and_pack_factor(self, prefix: str): + print("lll", prefix) + bits = self.weight_bits # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: for pattern, dym_bits in self.dynamic_bits.items(): + print("re.match(pattern, prefix)",re.match(pattern, prefix), prefix) if re.match(pattern, prefix): bits = dym_bits break - return bits - - def get_pack_factor(self, prefix: str): - return 32 // self.get_weight_bits(prefix) # packed into int32 + if bits != self.weight_bits: + self.weight_bits = bits + self.pack_factor = 32 // self.weight_bits # packed into int32 def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self._weight_bits}, " + return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " @@ -154,7 +156,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase): """ def __init__(self, quant_config: GPTQMarlinConfig) -> None: - self.quant_config = quant_config + self.quant_config = deepcopy(quant_config) def create_weights( self, @@ -167,7 +169,9 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size - self.prefix = extra_weight_attrs.get("prefix", "") + prefix = extra_weight_attrs.get("prefix", "") + self.quant_config.update_bits_and_pack_factor(prefix=prefix) + print("wwww", self.quant_config.weight_bits) output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition @@ -199,7 +203,7 @@ def create_weights( # Quantized weights qweight = Parameter( torch.empty( - input_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), @@ -212,7 +216,7 @@ def create_weights( "input_dim": 0, "output_dim": 1, "packed_dim": 0, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, ) @@ -255,7 +259,7 @@ def create_weights( qzeros = Parameter( torch.empty( scales_and_zp_size, - output_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, device="meta", ), @@ -268,7 +272,7 @@ def create_weights( "input_dim": scales_and_zp_input_dim, "output_dim": 1, "packed_dim": 1, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, 
) @@ -310,7 +314,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.get_weight_bits(self.prefix)) + num_bits=self.quant_config.weight_bits) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. @@ -336,7 +340,7 @@ def apply( g_idx=layer.g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, - num_bits=self.quant_config.get_weight_bits(self.prefix), + num_bits=self.quant_config.weight_bits, output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, From 1b132c379eac2e7a769f1bcc32973ee5486b70b9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:25:38 +0000 Subject: [PATCH 04/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 91f9053c8401..213ca0c5921a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -169,9 +169,11 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size + prefix = extra_weight_attrs.get("prefix", "") + # Depending on prefix and dynamic_bits, bits and pack_factor may be modified. self.quant_config.update_bits_and_pack_factor(prefix=prefix) - print("wwww", self.quant_config.weight_bits) + output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition From 4b6375443d34d07994f6cf62a95b58a0b4ce2dba Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:26:20 +0000 Subject: [PATCH 05/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 213ca0c5921a..d91c098becea 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -202,6 +202,7 @@ def create_weights( # shard the scales in TP>1 case. 
scales_and_zp_input_dim = 0 scales_and_zp_size = input_size_per_partition // group_size + # Quantized weights qweight = Parameter( torch.empty( From 90258d2856339e7e0412b4e4c4b8137cba69db4f Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:26:58 +0000 Subject: [PATCH 06/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d91c098becea..d8a9c1b7b52f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -45,12 +45,10 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym=self.is_sym) def update_bits_and_pack_factor(self, prefix: str): - print("lll", prefix) bits = self.weight_bits # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: for pattern, dym_bits in self.dynamic_bits.items(): - print("re.match(pattern, prefix)",re.match(pattern, prefix), prefix) if re.match(pattern, prefix): bits = dym_bits break From a5d3c8b4ef6f0f3526c355288d6929074ba5408a Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 03:23:30 +0000 Subject: [PATCH 07/56] cleanup cleanup cleanup cleanup --- .../layers/quantization/gptq_marlin.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index cbcead0c7c20..d8a9c1b7b52f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,4 +1,5 @@ import re +from copy import deepcopy from typing import Any, Dict, List, Optional import torch @@ -31,33 +32,32 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, desc_act = False self.dynamic_bits = dynamic_bits - self._weight_bits = weight_bits - self._pack_factor = 32 // self._weight_bits # packed into int32 + self.weight_bits = weight_bits + self.pack_factor = 32 // self.weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act self.is_sym = is_sym self.lm_head_quantized = lm_head_quantized # Verify supported on platform. 
- verify_gptq_marlin_supported(num_bits=self._weight_bits, + verify_gptq_marlin_supported(num_bits=self.weight_bits, group_size=self.group_size, is_sym=self.is_sym) - def get_weight_bits(self, prefix: str): - bits = self._weight_bits + def update_bits_and_pack_factor(self, prefix: str): + bits = self.weight_bits # check for variable/dynamic bits if len(self.dynamic_bits) > 0 and prefix: for pattern, dym_bits in self.dynamic_bits.items(): if re.match(pattern, prefix): bits = dym_bits break - return bits - - def get_pack_factor(self, prefix: str): - return 32 // self.get_weight_bits(prefix) # packed into int32 + if bits != self.weight_bits: + self.weight_bits = bits + self.pack_factor = 32 // self.weight_bits # packed into int32 def __repr__(self) -> str: - return (f"GPTQMarlinConfig(weight_bits={self._weight_bits}, " + return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " @@ -154,7 +154,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase): """ def __init__(self, quant_config: GPTQMarlinConfig) -> None: - self.quant_config = quant_config + self.quant_config = deepcopy(quant_config) def create_weights( self, @@ -167,7 +167,11 @@ def create_weights( **extra_weight_attrs, ) -> None: del output_size - self.prefix = extra_weight_attrs.get("prefix", "") + + prefix = extra_weight_attrs.get("prefix", "") + # Depending on prefix and dynamic_bits, bits and pack_factor may be modified. + self.quant_config.update_bits_and_pack_factor(prefix=prefix) + output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition @@ -196,10 +200,11 @@ def create_weights( # shard the scales in TP>1 case. scales_and_zp_input_dim = 0 scales_and_zp_size = input_size_per_partition // group_size + # Quantized weights qweight = Parameter( torch.empty( - input_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), @@ -212,7 +217,7 @@ def create_weights( "input_dim": 0, "output_dim": 1, "packed_dim": 0, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, ) @@ -255,7 +260,7 @@ def create_weights( qzeros = Parameter( torch.empty( scales_and_zp_size, - output_size_per_partition // self.quant_config.get_pack_factor(self.prefix), + output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, device="meta", ), @@ -268,7 +273,7 @@ def create_weights( "input_dim": scales_and_zp_input_dim, "output_dim": 1, "packed_dim": 1, - "pack_factor": self.quant_config.get_pack_factor(self.prefix), + "pack_factor": self.quant_config.pack_factor, }, ) @@ -310,7 +315,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, - num_bits=self.quant_config.get_weight_bits(self.prefix)) + num_bits=self.quant_config.weight_bits) replace_tensor(layer, "qweight", marlin_qweight) # Permute scales from autogptq format to marlin format. 
@@ -336,7 +341,7 @@ def apply( g_idx=layer.g_idx, g_idx_sort_indices=layer.g_idx_sort_indices, workspace=layer.workspace, - num_bits=self.quant_config.get_weight_bits(self.prefix), + num_bits=self.quant_config.weight_bits, output_size_per_partition=layer.output_size_per_partition, input_size_per_partition=layer.input_size_per_partition, is_k_full=layer.is_k_full, From 5682124103f7706a92cd3f9bd69d00f3c74f67c2 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 08:48:54 +0000 Subject: [PATCH 08/56] load "dynamic" field from config --- .../layers/quantization/gptq_marlin.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d8a9c1b7b52f..a6c28a90671b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,6 +1,6 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from torch.nn.parameter import Parameter @@ -25,13 +25,13 @@ class GPTQMarlinConfig(QuantizationConfig): """Config class for GPTQ Marlin""" def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool, dynamic_bits: Dict[str, int]) -> None: + is_sym: bool, lm_head_quantized: bool, dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - self.dynamic_bits = dynamic_bits + self.dynamic = dynamic self.weight_bits = weight_bits self.pack_factor = 32 // self.weight_bits # packed into int32 self.group_size = group_size @@ -44,13 +44,16 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, group_size=self.group_size, is_sym=self.is_sym) - def update_bits_and_pack_factor(self, prefix: str): + def update_config(self, prefix: str): bits = self.weight_bits - # check for variable/dynamic bits - if len(self.dynamic_bits) > 0 and prefix: - for pattern, dym_bits in self.dynamic_bits.items(): + # check for variable/dynamic config + if len(self.dynamic) > 0 and prefix: + for pattern, dym in self.dynamic.items(): if re.match(pattern, prefix): - bits = dym_bits + bits = dym.get("bits", bits) + self.group_size = dym.get("group_size", self.group_size) + self.desc_act = dym.get("bits", self.desc_act) + self.is_sym = dym.get("is_sym", self.is_sym) break if bits != self.weight_bits: self.weight_bits = bits @@ -61,7 +64,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " - f"dynamic_bits={self.dynamic_bits}") + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -81,7 +84,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": - dynamic_bits = cls.get_from_keys_or(config, ["dynamic_bits"], default={}) + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -89,7 +92,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return 
cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized, dynamic_bits) + lm_head_quantized, dynamic) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -169,8 +172,8 @@ def create_weights( del output_size prefix = extra_weight_attrs.get("prefix", "") - # Depending on prefix and dynamic_bits, bits and pack_factor may be modified. - self.quant_config.update_bits_and_pack_factor(prefix=prefix) + # Depending on prefix and dynamic, some arguments may be modified. + self.quant_config.update_config(prefix=prefix) output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition From d651668b14b80affb96385cc31227651b05f04b0 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 2 Aug 2024 17:30:51 +0000 Subject: [PATCH 09/56] fix key error: change "is_sym" to "sym" --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index a6c28a90671b..d0efd97e8907 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -53,7 +53,7 @@ def update_config(self, prefix: str): bits = dym.get("bits", bits) self.group_size = dym.get("group_size", self.group_size) self.desc_act = dym.get("bits", self.desc_act) - self.is_sym = dym.get("is_sym", self.is_sym) + self.is_sym = dym.get("sym", self.is_sym) break if bits != self.weight_bits: self.weight_bits = bits From e9ae8f5fa7168de585396b57c34a22a102459f26 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 6 Aug 2024 03:11:10 +0000 Subject: [PATCH 10/56] update quant_type --- .../model_executor/layers/quantization/gptq_marlin.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 8dc28c6f6264..71654fac4edf 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -40,6 +40,7 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.dynamic = dynamic self.weight_bits = weight_bits + self.is_sym = is_sym self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act @@ -66,9 +67,13 @@ def update_config(self, prefix: str): self.desc_act = dym.get("bits", self.desc_act) self.is_sym = dym.get("sym", self.is_sym) break - if bits != self.weight_bits: - self.weight_bits = bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + + self.pack_factor = 32 // bits # packed into int32 + if (bits, self.is_sym) not in self.TYPE_MAP: + raise ValueError("Unsupported quantization config: " + f"bits={bits}, sym={self.is_sym}") + + self.quant_type = self.TYPE_MAP[(bits, self.is_sym)] def __repr__(self) -> str: return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " From 19d77723aa88c1da28f9f98a64fde3c16a7e4c30 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 13:49:57 +0800 Subject: [PATCH 11/56] update --- .../layers/quantization/gptq_marlin.py | 558 +++++++++++++----- 1 file changed, 398 insertions(+), 160 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 71654fac4edf..9c125dd5149e 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ 
b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,22 +1,24 @@ import re from copy import deepcopy -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union import torch -from torch.nn.parameter import Parameter - +import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - apply_gptq_marlin_linear, check_marlin_supported, marlin_is_k_full, - marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, - marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor, - verify_marlin_supported, verify_marlin_supports_shape) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.kernels import MPLinearLayerConfig, choose_mp_linear_kernel +from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.marlin_utils import (check_marlin_supported, + marlin_moe_permute_scales, + marlin_repeat_scales_on_all_ranks, + verify_marlin_supported) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, + PackedColumnParameter, PackedvLLMParameter, RowvLLMParameter) +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -31,8 +33,15 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool, dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]] + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -41,6 +50,7 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.dynamic = dynamic self.weight_bits = weight_bits self.is_sym = is_sym + self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act @@ -52,21 +62,14 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # Verify supported on platform. 
- verify_marlin_supported(quant_type=self.quant_type, - group_size=self.group_size) - def update_config(self, prefix: str): bits = self.weight_bits # check for variable/dynamic config - if len(self.dynamic) > 0 and prefix: - for pattern, dym in self.dynamic.items(): - if re.match(pattern, prefix): - bits = dym.get("bits", bits) - self.group_size = dym.get("group_size", self.group_size) - self.desc_act = dym.get("bits", self.desc_act) - self.is_sym = dym.get("sym", self.is_sym) - break + if self.dynamic and len(self.dynamic) > 0 and prefix: + bits = self.dynamic_get(prefix, "bits", bits) + self.group_size = self.dynamic_get(prefix, "group_size", self.group_size) + self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) + self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) self.pack_factor = 32 // bits # packed into int32 if (bits, self.is_sym) not in self.TYPE_MAP: @@ -131,24 +134,42 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None + def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int, bool] = None) -> Union[Dict, int, bool]: + for pattern, pattern_dict in self.dynamic.items(): + if pattern.startswith("-:"): + if re.match(pattern.removeprefix("-:"), layer_name): + return False + elif re.match(pattern.removeprefix("+:"), layer_name): + if key is None: + return pattern_dict + else: + return pattern_dict.get(key, default_value) + return default_value + def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQMarlinLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return GPTQMarlinLinearMethod(self) + prefix: str + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: + if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + return UnquantizedLinearMethod() + + if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) + and self.lm_head_quantized): + return GPTQMarlinLinearMethod(self, prefix=prefix) + elif isinstance(layer, FusedMoE): + return GPTQMarlinMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() - num_bits = quant_config.get("bits", None) - group_size = quant_config.get("group_size", None) - sym = quant_config.get("sym", None) - desc_act = quant_config.get("desc_act", None) + num_bits = quant_config.get("bits") + group_size = quant_config.get("group_size") + sym = quant_config.get("sym") + desc_act = quant_config.get("desc_act") + + if not current_platform.is_cuda(): + return False if quant_method != "gptq": return False @@ -162,8 +183,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): return False return check_marlin_supported(quant_type=cls.TYPE_MAP[(num_bits, sym)], - group_size=group_size, - min_capability=cls.get_min_capability()) + group_size=group_size) class GPTQMarlinLinearMethod(LinearMethodBase): @@ -173,8 +193,15 @@ class GPTQMarlinLinearMethod(LinearMethodBase): quant_config: The GPTQ Marlin quantization config. 
""" - def __init__(self, quant_config: GPTQMarlinConfig) -> None: + _kernel_backends_being_used: Set[str] = set() + + def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.quant_config = deepcopy(quant_config) + self.prefix = prefix + + # Verify supported on platform. + verify_marlin_supported(quant_type=self.quant_config.quant_type, + group_size=self.quant_config.group_size) def create_weights( self, @@ -186,14 +213,30 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - del output_size - - prefix = extra_weight_attrs.get("prefix", "") # Depending on prefix and dynamic, some arguments may be modified. - self.quant_config.update_config(prefix=prefix) + self.quant_config.update_config(prefix=self.prefix) output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition + weight_loader = extra_weight_attrs.get("weight_loader") + + mp_linear_kernel_config = MPLinearLayerConfig( + full_weight_shape=(input_size, output_size), + partition_weight_shape=\ + (input_size_per_partition, output_size_per_partition), + weight_type=self.quant_config.quant_type, + act_type=params_dtype, + group_size=self.quant_config.group_size, + zero_points=False, + has_g_idx=self.quant_config.desc_act + ) + + kernel_type = choose_mp_linear_kernel(mp_linear_kernel_config) + + if kernel_type.__name__ not in self._kernel_backends_being_used: + logger.info("Using %s for GPTQMarlinLinearMethod", + kernel_type.__name__) + self._kernel_backends_being_used.add(kernel_type.__name__) # Normalize group_size if self.quant_config.group_size != -1: @@ -201,12 +244,6 @@ def create_weights( else: group_size = input_size - verify_marlin_supports_shape( - output_size_per_partition=output_size_per_partition, - input_size_per_partition=input_size_per_partition, - input_size=input_size, - group_size=group_size) - # Determine sharding if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act, self.quant_config.group_size, @@ -222,147 +259,348 @@ def create_weights( scales_and_zp_size = input_size_per_partition // group_size # Quantized weights - qweight = Parameter( - torch.empty( + qweight = PackedvLLMParameter( + data=torch.empty( input_size_per_partition // self.quant_config.pack_factor, output_size_per_partition, dtype=torch.int32, ), - requires_grad=False, - ) - set_weight_attrs( - qweight, - { - **extra_weight_attrs, - "input_dim": 0, - "output_dim": 1, - "packed_dim": 0, - "pack_factor": self.quant_config.pack_factor, - }, - ) + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_loader=weight_loader) # Activation order - g_idx = Parameter( + g_idx = RowvLLMParameter(data=torch.empty( + input_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + weight_loader=weight_loader) + + qzeros_args = { + "data": torch.empty( - input_size_per_partition, + scales_and_zp_size, + output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, ), - requires_grad=False, - ) - # Ignore warning from fused linear layers such as QKVParallelLinear. 
- set_weight_attrs( - g_idx, - { - **extra_weight_attrs, "input_dim": 0, - "ignore_warning": True - }, - ) - - # Scales - scales = Parameter( + "weight_loader": + weight_loader + } + weight_scale_args = { + "data": torch.empty( scales_and_zp_size, output_size_per_partition, dtype=params_dtype, ), + "weight_loader": + weight_loader + } + + if scales_and_zp_input_dim is None: + scales = ChannelQuantScaleParameter(output_dim=1, + **weight_scale_args) + qzeros = PackedColumnParameter( + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args) + + else: + scales = GroupQuantScaleParameter(output_dim=1, + input_dim=0, + **weight_scale_args) + qzeros = PackedvLLMParameter( + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=self.quant_config.pack_factor, + **qzeros_args) + + layer.register_parameter("qweight", qweight) + layer.register_parameter("g_idx", g_idx) + layer.register_parameter("scales", scales) + layer.register_parameter("qzeros", qzeros) + + self.kernel = kernel_type(mp_linear_kernel_config, + w_q_param_name="qweight", + w_s_param_name="scales", + w_zp_param_name="qzeros", + w_gidx_param_name="g_idx") + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + self.kernel.process_weights_after_loading(layer) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return self.kernel.apply_weights(layer, x, bias) + + +class GPTQMarlinMoEMethod(FusedMoEMethodBase): + """MoE Marlin method with quantization.""" + + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + # Currently assuming is_k_full is always True + # (input size per partition is the same as full input size) + # Supports only sym for now (no zp) + if self.quant_config.group_size != -1: + scales_size13 = hidden_size // self.quant_config.group_size + scales_size2 = intermediate_size // self.quant_config.group_size + strategy = FusedMoeWeightScaleSupported.GROUP.value + else: + scales_size13 = 1 + scales_size2 = 1 + strategy = FusedMoeWeightScaleSupported.CHANNEL.value + + extra_weight_attrs.update({ + "quant_method": strategy, + "is_transposed": True + }) + # Fused gate_up_proj (column parallel) + w13_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size // self.quant_config.pack_factor, + 2 * intermediate_size, + dtype=torch.int32, + ), requires_grad=False, ) - set_weight_attrs( - scales, - { - **extra_weight_attrs, - "input_dim": scales_and_zp_input_dim, - "output_dim": 1, - }, + layer.register_parameter("w13_qweight", w13_qweight) + set_weight_attrs(w13_qweight, extra_weight_attrs) + # down_proj (row parallel) + w2_qweight = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size // self.quant_config.pack_factor, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, ) - - # Quantized zero-points - qzeros = Parameter( + layer.register_parameter("w2_qweight", w2_qweight) + set_weight_attrs(w2_qweight, extra_weight_attrs) + # up_proj scales + w13_scales = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w13_scales", w13_scales) + set_weight_attrs(w13_scales, extra_weight_attrs) + # down_proj scales + w2_scales 
= torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size, + dtype=torch.half), + requires_grad=False, + ) + layer.register_parameter("w2_scales", w2_scales) + set_weight_attrs(w2_scales, extra_weight_attrs) + # up_proj scales + w13_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size13, + 2 * intermediate_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w13_qzeros", w13_qzeros) + set_weight_attrs(w13_qzeros, extra_weight_attrs) + # down_proj scales + w2_qzeros = torch.nn.Parameter( + torch.empty(num_experts, + scales_size2, + hidden_size // self.quant_config.pack_factor, + dtype=params_dtype), + requires_grad=False, + ) + layer.register_parameter("w2_qzeros", w2_qzeros) + set_weight_attrs(w2_qzeros, extra_weight_attrs) + w13_g_idx = torch.nn.Parameter( torch.empty( - scales_and_zp_size, - output_size_per_partition // self.quant_config.pack_factor, + num_experts, + hidden_size, dtype=torch.int32, - device="meta", ), requires_grad=False, ) - set_weight_attrs( - qzeros, - { - **extra_weight_attrs, - "input_dim": scales_and_zp_input_dim, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": self.quant_config.pack_factor, - }, + layer.register_parameter("w13_g_idx", w13_g_idx) + set_weight_attrs(w13_g_idx, extra_weight_attrs) + w2_g_idx = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, ) + layer.register_parameter("w2_g_idx", w2_g_idx) + set_weight_attrs(w2_g_idx, extra_weight_attrs) + w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs) + w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty( + num_experts, + intermediate_size, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs) - layer.register_parameter("qweight", qweight) - layer.register_parameter("g_idx", g_idx) - layer.register_parameter("scales", scales) - layer.register_parameter("qzeros", qzeros) - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.input_size = input_size - layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act, - is_row_parallel) - - # Checkpoints are serialized in AutoGPTQ format, which is different from the - # marlin format. This function is called after the weights are loaded. - # Here, we handle the repacking, including the activation reordering case. def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - device = layer.qweight.device - # Allocate marlin workspace - layer.workspace = marlin_make_workspace( - layer.output_size_per_partition, device) - - # Handle sorting for activation reordering if needed. 
+ # Process act_order if self.quant_config.desc_act: - g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.g_idx) - layer.g_idx_sort_indices = g_idx_sort_indices - replace_tensor(layer, "g_idx", g_idx) + # Get sorting based on g_idx + num_experts = layer.w13_g_idx.shape[0] + w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx) + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to( + torch.int32) + w13_sorted_g_idx[e] = layer.w13_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_g_idx[e][ + w2_g_idx_sort_indices[e]] + replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx) + replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx) + replace_parameter(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_parameter(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) else: - layer.g_idx = marlin_make_empty_g_idx(device) - layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) - - # No zero-point - layer.zp = marlin_make_empty_g_idx(device) - - # Repack weights from autogptq format to marlin format. - marlin_qweight = ops.gptq_marlin_repack( - layer.qweight, - perm=layer.g_idx_sort_indices, - size_k=layer.input_size_per_partition, - size_n=layer.output_size_per_partition, - num_bits=self.quant_config.quant_type.size_bits) - replace_tensor(layer, "qweight", marlin_qweight) - - # Permute scales from autogptq format to marlin format. - marlin_scales = marlin_permute_scales( - layer.scales, - size_k=(layer.input_size if self.quant_config.desc_act else - layer.input_size_per_partition), - size_n=layer.output_size_per_partition, - group_size=self.quant_config.group_size) - replace_tensor(layer, "scales", marlin_scales) + # Reset g_idx related tensors + num_experts = layer.w13_g_idx.shape[0] + device = layer.w13_g_idx.device + layer.w13_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + # Repack weights + marlin_w13_qweight = ops.gptq_marlin_moe_repack( + layer.w13_qweight, + layer.w13_g_idx_sort_indices, + layer.w13_qweight.shape[1] * self.quant_config.pack_factor, + layer.w13_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_parameter(layer, "w13_qweight", marlin_w13_qweight) + marlin_w2_qweight = ops.gptq_marlin_moe_repack( + layer.w2_qweight, + layer.w2_g_idx_sort_indices, + layer.w2_qweight.shape[1] * self.quant_config.pack_factor, + layer.w2_qweight.shape[2], + self.quant_config.quant_type.size_bits, + ) + replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + # Repack scales + marlin_w13_scales = marlin_moe_permute_scales( + s=layer.w13_scales, + size_k=layer.intermediate_size_per_partition, + size_n=layer.w13_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_parameter(layer, "w13_scales", marlin_w13_scales) + 
marlin_w2_scales = marlin_moe_permute_scales( + s=layer.w2_scales, + size_k=layer.w2_scales.shape[1] * self.quant_config.pack_factor, + size_n=layer.w2_scales.shape[2], + group_size=self.quant_config.group_size, + ) + replace_parameter(layer, "w2_scales", marlin_w2_scales) def apply( self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor] = None, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - return apply_gptq_marlin_linear( - input=x, - weight=layer.qweight, - weight_scale=layer.scales, - weight_zp=layer.zp, - g_idx=layer.g_idx, - g_idx_sort_indices=layer.g_idx_sort_indices, - workspace=layer.workspace, - wtype=self.quant_config.quant_type, - output_size_per_partition=layer.output_size_per_partition, - input_size_per_partition=layer.input_size_per_partition, - is_k_full=layer.is_k_full, - bias=bias) + # The input must currently be float16 + orig_dtype = x.dtype + x = x.half() + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=None) + + return torch.ops.vllm.fused_marlin_moe( + x, + layer.w13_qweight, + layer.w2_qweight, + layer.w13_scales, + layer.w2_scales, + router_logits, + topk_weights, + topk_ids, + g_idx1=layer.w13_g_idx, + g_idx2=layer.w2_g_idx, + sort_indices1=layer.w13_g_idx_sort_indices, + sort_indices2=layer.w2_g_idx_sort_indices, + num_bits=self.quant_config.quant_type.size_bits, + ).to(orig_dtype) \ No newline at end of file From 856532804685b47661708d352c4850979f047ee7 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:15:11 +0800 Subject: [PATCH 12/56] fix judgment error --- vllm/model_executor/layers/quantization/gptq_marlin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 94187214b741..14dd51df62d7 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -157,11 +157,11 @@ def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: - if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 - return UnquantizedLinearMethod() - if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): + if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + return UnquantizedLinearMethod() + return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) From 84ada54655b77dd2810d927cad3a4683bb1a0043 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:19:32 +0800 Subject: [PATCH 13/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 14dd51df62d7..18d36a1b230f 100644 
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -10,7 +10,8 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs, UnquantizedLinearMethod) + set_weight_attrs, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kernels import ( @@ -141,7 +142,8 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int, bool] = None) -> Union[Dict, int, bool]: + def dynamic_get(self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None + ) -> Union[Dict, int, bool]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -161,7 +163,7 @@ def get_quant_method( and self.lm_head_quantized): if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 return UnquantizedLinearMethod() - + return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) From e81a7da33392a310c670d069263e0be0987d61b0 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:28:17 +0800 Subject: [PATCH 14/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 18d36a1b230f..ed420f77ec53 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -71,11 +71,13 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] def update_config(self, prefix: str): - bits = self.weight_bits + bits: Optional[int] = self.weight_bits # check for variable/dynamic config if self.dynamic and len(self.dynamic) > 0 and prefix: bits = self.dynamic_get(prefix, "bits", bits) - self.group_size = self.dynamic_get(prefix, "group_size", self.group_size) + group_size = self.dynamic_get(prefix, "group_size", self.group_size) + assert group_size is not None + self.group_size = group_size self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) @@ -158,7 +160,7 @@ def dynamic_get(self, layer_name: str, key: Optional[str] = None, default_value: def get_quant_method( self, layer: torch.nn.Module, prefix: str - ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod]]: + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 From 68291ce51551a62367c38b51145a6b89f578bd31 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:30:17 +0800 Subject: [PATCH 15/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 
ed420f77ec53..81263ad51d90 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -75,9 +75,8 @@ def update_config(self, prefix: str): # check for variable/dynamic config if self.dynamic and len(self.dynamic) > 0 and prefix: bits = self.dynamic_get(prefix, "bits", bits) - group_size = self.dynamic_get(prefix, "group_size", self.group_size) - assert group_size is not None - self.group_size = group_size + self.group_size = self.dynamic_get(prefix, "group_size", + self.group_size) self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) @@ -144,7 +143,8 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get(self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None + def dynamic_get(self, layer_name: str, key: Optional[str] = None, + default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): From 78674057820283efac50fe98c1b5dd378b992453 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 16:47:32 +0800 Subject: [PATCH 16/56] cleanup --- .../layers/quantization/gptq_marlin.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 81263ad51d90..44efedfe92ef 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -10,8 +10,8 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, - set_weight_attrs, - UnquantizedLinearMethod) + UnquantizedLinearMethod, + set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kernels import ( @@ -41,15 +41,9 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__( - self, - weight_bits: int, - group_size: int, - desc_act: bool, - is_sym: bool, - lm_head_quantized: bool, - dynamic: Dict[str, Dict[str, Union[int, bool]]] - ) -> None: + def __init__(self, weight_bits: int, group_size: int, desc_act: bool, + is_sym: bool, lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -143,9 +137,12 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get(self, layer_name: str, key: Optional[str] = None, - default_value: Union[int, bool, None] = None - ) -> Union[Dict, int, bool]: + def dynamic_get( + self, + layer_name: str, + key: Optional[str] = None, + default_value: Union[int, bool, + None] = None) -> Union[Dict, int, bool]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -157,13 +154,12 @@ def dynamic_get(self, layer_name: str, key: Optional[str] = None, return pattern_dict.get(key, default_value) return default_value - def get_quant_method( - self, layer: torch.nn.Module, prefix: str + self, layer: torch.nn.Module, 
prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic and self.dynamic_get(layer_name=prefix) == False: # noqa: E712 + if self.dynamic and not self.dynamic_get(layer_name=prefix): return UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) From c63ba512cd14b14b408da7807b28ac21090f47f1 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 17:44:03 +0800 Subject: [PATCH 17/56] cleanup --- .../layers/quantization/gptq_marlin.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 44efedfe92ef..90a9688020c8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -65,14 +65,22 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] def update_config(self, prefix: str): - bits: Optional[int] = self.weight_bits + bits = self.weight_bits # check for variable/dynamic config if self.dynamic and len(self.dynamic) > 0 and prefix: - bits = self.dynamic_get(prefix, "bits", bits) - self.group_size = self.dynamic_get(prefix, "group_size", - self.group_size) - self.desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) - self.is_sym = self.dynamic_get(prefix, "sym", self.is_sym) + b = self.dynamic_get(prefix, "bits", bits) + if isinstance(b, int): + bits = b + group_size = self.dynamic_get(prefix, "group_size", + self.group_size) + if isinstance(group_size, int): + self.group_size = group_size + desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) + if isinstance(desc_act, bool): + self.desc_act = desc_act + is_sym = self.dynamic_get(prefix, "sym", self.is_sym) + if isinstance(is_sym, bool): + self.is_sym = is_sym self.pack_factor = 32 // bits # packed into int32 if (bits, self.is_sym) not in self.TYPE_MAP: @@ -141,8 +149,8 @@ def dynamic_get( self, layer_name: str, key: Optional[str] = None, - default_value: Union[int, bool, - None] = None) -> Union[Dict, int, bool]: + default_value: Union[int, bool, None] = None + ) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in self.dynamic.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -156,11 +164,14 @@ def dynamic_get( def get_quant_method( self, layer: torch.nn.Module, prefix: str - ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", + UnquantizedLinearMethod]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic and not self.dynamic_get(layer_name=prefix): - return UnquantizedLinearMethod() + if self.dynamic: + result = self.dynamic_get(layer_name=prefix) + if result is not None and not result: + return UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 5f9b712cc18ec6a71416f3b6b8d2022ac92fdb78 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 17:51:15 +0800 Subject: [PATCH 18/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 90a9688020c8..ad188ee54f8d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -232,7 +232,7 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # Depending on prefix and dynamic, some arguments may be modified. + # gptqmodel per module/layer dynamic config my override/change base model quant config self.quant_config.update_config(prefix=self.prefix) output_size_per_partition = sum(output_partition_sizes) From 36925788aade16a66f022dc985d6f4209503b94a Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 17:59:55 +0800 Subject: [PATCH 19/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ad188ee54f8d..c007aedaf0a8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -232,7 +232,7 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # gptqmodel per module/layer dynamic config my override/change base model quant config + # gptqmodel's dynamic config per module may override base model quant config self.quant_config.update_config(prefix=self.prefix) output_size_per_partition = sum(output_partition_sizes) From f902b2d08868892451ac769844eed6ee50bcd74e Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:01:32 +0800 Subject: [PATCH 20/56] cleanup --- .../layers/quantization/gptq_marlin.py | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ad188ee54f8d..3d95bf5cf9a3 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -43,13 +43,13 @@ class GPTQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, desc_act: bool, is_sym: bool, lm_head_quantized: bool, - dynamic: Dict[str, Dict[str, Union[int, bool]]]) -> None: + dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]]) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - self.dynamic = dynamic + self.dynamic_cfg = dynamic_cfg self.weight_bits = weight_bits self.is_sym = is_sym @@ -66,21 +66,21 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, def update_config(self, prefix: str): bits = self.weight_bits - # check for variable/dynamic config - if self.dynamic and len(self.dynamic) > 0 and prefix: - b = self.dynamic_get(prefix, "bits", bits) - if isinstance(b, int): - bits = b - group_size = self.dynamic_get(prefix, "group_size", - self.group_size) - if isinstance(group_size, int): - self.group_size = group_size - desc_act = self.dynamic_get(prefix, "desc_act", self.desc_act) - if isinstance(desc_act, bool): - self.desc_act = desc_act - is_sym = self.dynamic_get(prefix, "sym", self.is_sym) - if isinstance(is_sym, bool): - self.is_sym = is_sym + + b = self.gptqmodel_dynamic_config(prefix, "bits", bits) + if isinstance(b, int): + bits = b + group_size = 
self.gptqmodel_dynamic_config(prefix, "group_size", + self.group_size) + if isinstance(group_size, int): + self.group_size = group_size + desc_act = self.gptqmodel_dynamic_config(prefix, "desc_act", + self.desc_act) + if isinstance(desc_act, bool): + self.desc_act = desc_act + is_sym = self.gptqmodel_dynamic_config(prefix, "sym", self.is_sym) + if isinstance(is_sym, bool): + self.is_sym = is_sym self.pack_factor = 32 // bits # packed into int32 if (bits, self.is_sym) not in self.TYPE_MAP: @@ -94,7 +94,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " - f"dynamic={self.dynamic}") + f"dynamic_cfg={self.dynamic_cfg}") @classmethod def get_name(cls) -> str: @@ -114,7 +114,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": - dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic_cfg = cls.get_from_keys_or(config, ["dynamic"], default={}) weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -122,7 +122,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized, dynamic) + lm_head_quantized, dynamic_cfg) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -145,13 +145,13 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def dynamic_get( + def gptqmodel_dynamic_config( self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: - for pattern, pattern_dict in self.dynamic.items(): + for pattern, pattern_dict in self.dynamic_cfg.items(): if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False @@ -168,8 +168,8 @@ def get_quant_method( UnquantizedLinearMethod]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic: - result = self.dynamic_get(layer_name=prefix) + if self.dynamic_cfg: + result = self.gptqmodel_dynamic_config(layer_name=prefix) if result is not None and not result: return UnquantizedLinearMethod() @@ -218,6 +218,11 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.quant_config = deepcopy(quant_config) self.prefix = prefix + if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: + # gptqmodel per module/layer dynamic config my override/change base + # model quant config + self.quant_config.update_config(prefix=self.prefix) + # Verify supported on platform. 
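(Side note on the hunk above: because each GPTQMarlinLinearMethod deep-copies the quant config and then calls update_config() with its own prefix, two layers of the same model can end up with different effective settings while the shared base config stays untouched. A minimal sketch of the idea, assuming a platform where the Marlin kernels are supported; the rule and layer prefixes below are illustrative only and not part of this patch.)

from vllm.model_executor.layers.quantization.gptq_marlin import (
    GPTQMarlinConfig, GPTQMarlinLinearMethod)

# Hypothetical: the base model is 4-bit / group_size 128, and only layer 1 is
# bumped to 8-bit / group_size 32 by a single positive rule.
dynamic_cfg = {r"+:.*\.layers\.1\..*": {"bits": 8, "group_size": 32}}
base_cfg = GPTQMarlinConfig(weight_bits=4, group_size=128, desc_act=False,
                            is_sym=True, lm_head_quantized=False,
                            dynamic_cfg=dynamic_cfg)
layer0 = GPTQMarlinLinearMethod(base_cfg, prefix="model.layers.0.self_attn.qkv_proj")
layer1 = GPTQMarlinLinearMethod(base_cfg, prefix="model.layers.1.self_attn.qkv_proj")
# layer0.quant_config keeps 4-bit / group_size 128; layer1.quant_config was
# overridden to 8-bit / group_size 32 inside __init__ via update_config(),
# and base_cfg itself is left unchanged.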
verify_marlin_supported(quant_type=self.quant_config.quant_type, group_size=self.quant_config.group_size) @@ -232,9 +237,6 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # gptqmodel per module/layer dynamic config my override/change base model quant config - self.quant_config.update_config(prefix=self.prefix) - output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition weight_loader = extra_weight_attrs.get("weight_loader") From 9b9d7e3fb359c60ebe28be26133c9fa3ceb63958 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 18:04:33 +0800 Subject: [PATCH 21/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4a67b6ebd3c4..a369dc99b7ee 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -152,9 +152,11 @@ def gptqmodel_dynamic_config( default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in self.dynamic_cfg.items(): + # negative match: matched modules are excluded from quantization if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False + # positive match: matched modules have quant properties overriding base quant config elif re.match(pattern.removeprefix("+:"), layer_name): if key is None: return pattern_dict From 055913782013a6c58482ee33008187ce5c4a5a11 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:09:17 +0800 Subject: [PATCH 22/56] cleanup --- .../layers/quantization/gptq_marlin.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 4a67b6ebd3c4..5be7b8a0c683 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -41,9 +41,15 @@ class GPTQMarlinConfig(QuantizationConfig): (8, True): scalar_types.uint8b128, } - def __init__(self, weight_bits: int, group_size: int, desc_act: bool, - is_sym: bool, lm_head_quantized: bool, - dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]]) -> None: + def __init__( + self, + weight_bits: int, + group_size: int, + desc_act: bool, + is_sym: bool, + lm_head_quantized: bool, + dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]], + ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) @@ -145,7 +151,7 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def gptqmodel_dynamic_config( + def get_dynamic_config( self, layer_name: str, key: Optional[str] = None, @@ -219,7 +225,7 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.prefix = prefix if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: - # gptqmodel per module/layer dynamic config my override/change base + # gptqmodel per module/layer dynamic_cfg my override/change base # model quant config self.quant_config.update_config(prefix=self.prefix) @@ -237,9 +243,6 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ) -> None: - # gptqmodel per module/layer dynamic config my 
override/change base model quant config - self.quant_config.update_config(prefix=self.prefix) - output_size_per_partition = sum(output_partition_sizes) is_row_parallel = input_size != input_size_per_partition weight_loader = extra_weight_attrs.get("weight_loader") From 3a2bb94b5881966b380258062fb52b914d66893c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:27:42 +0800 Subject: [PATCH 23/56] cleanup --- .../layers/quantization/gptq_marlin.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index eff4aacc93e1..8b8fec76fbe5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -73,18 +73,17 @@ def __init__( def update_config(self, prefix: str): bits = self.weight_bits - b = self.gptqmodel_dynamic_config(prefix, "bits", bits) + b = self.get_dynamic_config(prefix, "bits", bits) if isinstance(b, int): bits = b - group_size = self.gptqmodel_dynamic_config(prefix, "group_size", - self.group_size) + group_size = self.get_dynamic_config(prefix, "group_size", + self.group_size) if isinstance(group_size, int): self.group_size = group_size - desc_act = self.gptqmodel_dynamic_config(prefix, "desc_act", - self.desc_act) + desc_act = self.get_dynamic_config(prefix, "desc_act", self.desc_act) if isinstance(desc_act, bool): self.desc_act = desc_act - is_sym = self.gptqmodel_dynamic_config(prefix, "sym", self.is_sym) + is_sym = self.get_dynamic_config(prefix, "sym", self.is_sym) if isinstance(is_sym, bool): self.is_sym = is_sym @@ -162,7 +161,8 @@ def get_dynamic_config( if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False - # positive match: matched modules have quant properties overriding base quant config + # positive match: matched modules have quant properties overriding + # base quant config elif re.match(pattern.removeprefix("+:"), layer_name): if key is None: return pattern_dict @@ -177,7 +177,7 @@ def get_quant_method( if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): if self.dynamic_cfg: - result = self.gptqmodel_dynamic_config(layer_name=prefix) + result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: return UnquantizedLinearMethod() From 3c0d45aad38f4896598834e3390f0948ed52070a Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 18:29:11 +0800 Subject: [PATCH 24/56] cleanup --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 8b8fec76fbe5..66574127b985 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -176,7 +176,7 @@ def get_quant_method( UnquantizedLinearMethod]]: if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized): - if self.dynamic_cfg: + if len(self.dynamic_cfg) > 0: result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: return UnquantizedLinearMethod() From 74b1d4223e93d45e678a0d6efd5ed3965eedca63 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 20:41:00 +0800 Subject: [PATCH 25/56] add test_gptq_dynamic_cfg.py --- 
tests/quantization/test_gptq_dynamic_cfg.py | 46 +++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/quantization/test_gptq_dynamic_cfg.py diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py new file mode 100644 index 000000000000..7e0b9d88a9cf --- /dev/null +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -0,0 +1,46 @@ +"""Tests whether gptq models with dynamic_cfg quantized can be loaded. + +Run `pytest tests/quantization/test_gptq_dynamic_cfg.py --forked`. +""" + +import pytest +import torch + +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod) +from vllm.model_executor.layers.linear import UnquantizedLinearMethod + +PROMPT = "On the surface of Mars, we found" + +MODELS_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] + + +@pytest.mark.parametrize("model_id", MODELS_QUANT) +def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): + vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + + for name, submodule in (vllm_model.model.llm_engine.model_executor. + driver_worker.model_runner.model.named_modules()): + if name == 'model.model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128, + # desc_act=True + assert isinstance(submodule, GPTQMarlinLinearMethod) + assert submodule.quant_config.bits == 4 + assert submodule.quant_config.group_size == 128 + assert submodule.quant_config.desc_act + elif name == 'model.model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32, + # desc_act=False + assert isinstance(submodule, GPTQMarlinLinearMethod) + assert submodule.quant_config.bits == 8 + assert submodule.quant_config.group_size == 32 + assert not submodule.quant_config.desc_act + elif (name == 'model.model.layers.2.self_attn.qkv_proj' + or name == 'model.model.layers.2.mlp.gate_up_proj'): + # Other layers are not quantized. 
+ assert isinstance(submodule, UnquantizedLinearMethod) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) + del vllm_model From b0672aea6006d3168ccfbb00ad44e56ee1adeb43 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 20:41:25 +0800 Subject: [PATCH 26/56] cleanup --- tests/quantization/test_gptq_dynamic_cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 7e0b9d88a9cf..9a7ac159dbed 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -6,9 +6,9 @@ import pytest import torch +from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) -from vllm.model_executor.layers.linear import UnquantizedLinearMethod PROMPT = "On the surface of Mars, we found" From 066f4898a1974baf063ee46eb902f6bb72dafe37 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 21:05:38 +0800 Subject: [PATCH 27/56] Update test_gptq_dynamic_cfg.py --- tests/quantization/test_gptq_dynamic_cfg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 9a7ac159dbed..86c290373ed4 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -12,6 +12,9 @@ PROMPT = "On the surface of Mars, we found" +# The first layer is quantized using bits=4, group_size=128 +# The second layer is quantized using bits=8, group_size=32 +# All other layers (layer index >= 2) are not quantized MODELS_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] From 6dc56a69088e09a36b0c9b2ba119c68366f1682d Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Tue, 24 Dec 2024 21:07:32 +0800 Subject: [PATCH 28/56] Update test_gptq_dynamic_cfg.py --- tests/quantization/test_gptq_dynamic_cfg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 86c290373ed4..7f2ad3dfa854 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -25,14 +25,14 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): for name, submodule in (vllm_model.model.llm_engine.model_executor. driver_worker.model_runner.model.named_modules()): if name == 'model.model.layers.0.self_attn.qkv_proj': - # The first layer is quantized using bits=4, group_size=128, + # The first layer is quantized using bits=4, group_size=128 # desc_act=True assert isinstance(submodule, GPTQMarlinLinearMethod) assert submodule.quant_config.bits == 4 assert submodule.quant_config.group_size == 128 assert submodule.quant_config.desc_act elif name == 'model.model.layers.1.self_attn.qkv_proj': - # The second layer is quantized using bits=8, group_size=32, + # The second layer is quantized using bits=8, group_size=32 # desc_act=False assert isinstance(submodule, GPTQMarlinLinearMethod) assert submodule.quant_config.bits == 8 @@ -40,7 +40,7 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): assert not submodule.quant_config.desc_act elif (name == 'model.model.layers.2.self_attn.qkv_proj' or name == 'model.model.layers.2.mlp.gate_up_proj'): - # Other layers are not quantized. 
+ # All other layers (layer index >= 2) are not quantized assert isinstance(submodule, UnquantizedLinearMethod) print( From 98a198e0274713e824d301711a661520f15044d3 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 21:08:48 +0800 Subject: [PATCH 29/56] cleanup --- tests/quantization/test_gptq_dynamic_cfg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 9a7ac159dbed..72403c68aa29 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -12,10 +12,10 @@ PROMPT = "On the surface of Mars, we found" -MODELS_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] +MODEL_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] -@pytest.mark.parametrize("model_id", MODELS_QUANT) +@pytest.mark.parametrize("model_id", MODEL_QUANT) def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) From c4a29eb5d50217605b295815230803e046339090 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 21:19:45 +0800 Subject: [PATCH 30/56] use PROMPT variable --- tests/quantization/test_gptq_dynamic_cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index 9e1ce642d02d..c00498ab4f33 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -44,6 +44,6 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): assert isinstance(submodule, UnquantizedLinearMethod) print( - vllm_model.generate_greedy(prompts=["Hello my name is"], + vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model From 25703e3eab4e123a4e3737ab8244e99263287257 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 24 Dec 2024 21:22:53 +0800 Subject: [PATCH 31/56] cleanup --- tests/quantization/test_gptq_dynamic_cfg.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index c00498ab4f33..abc48e786bd8 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -43,7 +43,5 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): # All other layers (layer index >= 2) are not quantized assert isinstance(submodule, UnquantizedLinearMethod) - print( - vllm_model.generate_greedy(prompts=[PROMPT], - max_tokens=10)[0][1]) + print(vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model From 070ae3c53985fa4fdd0debcf7d4d24d5aba66961 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Thu, 6 Feb 2025 17:29:14 +0800 Subject: [PATCH 32/56] rename method and add detailed comments --- .../layers/quantization/gptq_marlin.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 51e2d5bfeb38..2044d6d5591a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -58,7 +58,25 @@ def __init__( # (since we have only one group per output channel) desc_act = False + # GPTQModel use `dynamic_cfg` to allow per module quantization config so each module + # can be 
optmized for its own unique quant errors. Format is Dict[str, Dict] where key + # is a regex string that can both positive ("+:" prefixed) or negative ("-:" prefixed) match + # a module. Default to postiive match (override base quant config mode) if no prefix. + # Value is in dict format of field key and override value. Negative matching will skip + # quantization init for this module entirely (non-quantized inference). + # More details and quantize examples can be found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic_cfg = { + # #`.*\.` matches the layers_node prefix + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, # positive match layer 10-15 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # positive match layer 16-21 + # r"-:.*\.moe\..*": {}, # negative match all `moe` layers + #} self.dynamic_cfg = dynamic_cfg + + self.weight_bits = weight_bits self.is_sym = is_sym @@ -73,7 +91,8 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - def update_config(self, prefix: str): + # match dynamic rules with module name (prefix) and apply quantize config overrides if matched + def override_config(self, prefix: str): bits = self.weight_bits b = self.get_dynamic_config(prefix, "bits", bits) @@ -232,7 +251,7 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: # gptqmodel per module/layer dynamic_cfg my override/change base # model quant config - self.quant_config.update_config(prefix=self.prefix) + self.quant_config.override_config(prefix=self.prefix) # Verify supported on platform. verify_marlin_supported(quant_type=self.quant_config.quant_type, From 13b2b7ba4e8250c86f14624b69e7e9d07a0750d1 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 02:54:31 +0000 Subject: [PATCH 33/56] Changed VocabParallelEmbedding.linear_method to quant_method to be consistent with LinearBase. --- tests/quantization/test_lm_head.py | 4 ++-- vllm/lora/layers.py | 2 +- .../model_executor/layers/logits_processor.py | 2 +- .../layers/quantization/gptq_marlin.py | 10 ++++----- .../layers/vocab_parallel_embedding.py | 22 +++++++++---------- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ec60d8a57559..2fe3d2f1a867 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -37,11 +37,11 @@ def check_model(model): lm_head_layer = model.lm_head if lm_head_quantized: - assert isinstance(lm_head_layer.linear_method, + assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) else: - assert isinstance(lm_head_layer.linear_method, + assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) vllm_model.apply_model(check_model) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9f0297596ccb..04e5dc8752b6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1040,7 +1040,7 @@ def _get_logits( embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. 
- logits = lm_head.linear_method.apply(lm_head, hidden_states) + logits = lm_head.quant_method.apply(lm_head, hidden_states) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index cdc67ca83d48..a3d5e6da2683 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -95,7 +95,7 @@ def _get_logits( embedding_bias: Optional[torch.Tensor], ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. - logits = lm_head.linear_method.apply(lm_head, + logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 51e2d5bfeb38..97bc3dad001f 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, UnquantizedEmbeddingMethod from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -176,13 +176,13 @@ def get_dynamic_config( def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", - UnquantizedLinearMethod]]: - if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) - and self.lm_head_quantized): + UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: + lm_head_quantized = isinstance(layer, ParallelLMHead) and self.lm_head_quantized + if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic_cfg) > 0: result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: - return UnquantizedLinearMethod() + return UnquantizedEmbeddingMethod() if lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index e409094dd535..85081a26c149 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -226,24 +226,24 @@ def __init__(self, self.tp_size) self.embedding_dim = embedding_dim - linear_method = None + quant_method = None if quant_config is not None: - linear_method = quant_config.get_quant_method(self, prefix=prefix) - if linear_method is None: - linear_method = UnquantizedEmbeddingMethod() + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important. 
is_embedding_layer = type(self.__class__) is VocabParallelEmbedding - linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method)) - if is_embedding_layer and not linear_method_implements_embedding: + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method)) + if is_embedding_layer and not quant_method_implements_embedding: raise NotImplementedError( - f"The class {type(linear_method).__name__} must implement " + f"The class {type(quant_method).__name__} must implement " "the 'embedding' method, see UnquantizedEmbeddingMethod.") - self.linear_method: QuantizeMethodBase = linear_method + self.quant_method: QuantizeMethodBase = quant_method if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -260,7 +260,7 @@ def __init__(self, self.shard_indices.added_vocab_end_index - self.shard_indices.added_vocab_start_index) - self.linear_method.create_weights(self, + self.quant_method.create_weights(self, self.embedding_dim, [self.num_embeddings_per_partition], self.embedding_dim, @@ -412,7 +412,7 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = self.linear_method.embedding(self, + output_parallel = self.quant_method.embedding(self, masked_input.long()) # Mask the output embedding. if self.tp_size > 1: From 40562d16b2ea77381428b61ecd10e164572e39f1 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 04:57:36 +0000 Subject: [PATCH 34/56] fix unittest Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic_cfg.py | 37 ++++++++++------- tests/quantization/test_lm_head.py | 14 ++++--- .../layers/quantization/gptq_marlin.py | 40 +++++++++++-------- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic_cfg.py index abc48e786bd8..419eaae1c6cd 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic_cfg.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: Apache-2.0 """Tests whether gptq models with dynamic_cfg quantized can be loaded. Run `pytest tests/quantization/test_gptq_dynamic_cfg.py --forked`. @@ -15,7 +16,9 @@ # The first layer is quantized using bits=4, group_size=128 # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized -MODEL_QUANT = ["ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg"] +MODEL_QUANT = [ + "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head" +] @pytest.mark.parametrize("model_id", MODEL_QUANT) @@ -24,24 +27,30 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): for name, submodule in (vllm_model.model.llm_engine.model_executor. 
driver_worker.model_runner.model.named_modules()): - if name == 'model.model.layers.0.self_attn.qkv_proj': + if name == "lm_head": + assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + elif name == 'model.layers.0.self_attn.qkv_proj': # The first layer is quantized using bits=4, group_size=128 # desc_act=True - assert isinstance(submodule, GPTQMarlinLinearMethod) - assert submodule.quant_config.bits == 4 - assert submodule.quant_config.group_size == 128 - assert submodule.quant_config.desc_act - elif name == 'model.model.layers.1.self_attn.qkv_proj': + assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': # The second layer is quantized using bits=8, group_size=32 # desc_act=False - assert isinstance(submodule, GPTQMarlinLinearMethod) - assert submodule.quant_config.bits == 8 - assert submodule.quant_config.group_size == 32 - assert not submodule.quant_config.desc_act - elif (name == 'model.model.layers.2.self_attn.qkv_proj' - or name == 'model.model.layers.2.mlp.gate_up_proj'): + assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + config = submodule.quant_method.quant_config + assert config.get_dynamic_config(layer_name=name, key="bits") == 8 + assert config.get_dynamic_config(layer_name=name, + key="group_size") == 32 + assert not config.get_dynamic_config(layer_name=name, + key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized - assert isinstance(submodule, UnquantizedLinearMethod) + assert isinstance(submodule.quant_method, UnquantizedLinearMethod) print(vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 2fe3d2f1a867..246eff2588b9 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -17,10 +17,13 @@ PROMPT = "On the surface of Mars, we found" -MODELS_QUANT = [( - "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse", - True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)] +MODELS_QUANT = [ + ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head", + True), + ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), + ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), + ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) +] @pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) @@ -35,7 +38,8 @@ def test_lm_head( def check_model(model): lm_head_layer = model.lm_head - + print("lm_head_layer.quant_method", model, + lm_head_layer.quant_method) if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 94ad1443fb75..746371814236 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -2,7 +2,6 @@ import re from copy import deepcopy - from typing import Any, Callable, Dict, List, Optional, Set, Union import torch @@ -23,7 +22,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( 
check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, UnquantizedEmbeddingMethod +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, UnquantizedEmbeddingMethod) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -58,25 +58,28 @@ def __init__( # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic_cfg` to allow per module quantization config so each module - # can be optmized for its own unique quant errors. Format is Dict[str, Dict] where key - # is a regex string that can both positive ("+:" prefixed) or negative ("-:" prefixed) match - # a module. Default to postiive match (override base quant config mode) if no prefix. - # Value is in dict format of field key and override value. Negative matching will skip - # quantization init for this module entirely (non-quantized inference). - # More details and quantize examples can be found at: https://github.com/ModelCloud/GPTQModel + # GPTQModel use `dynamic_cfg` to allow per module quantization config + # so each module can be optimized for its own unique quant errors. + # Format is Dict[str, Dict] where key is a regex string that can both + # positive ("+:" prefixed) or negative ("-:" prefixed) match a module. + # Default to positive match (override base quant config mode) if no + # prefix. Value is in dict format of field key and override value. + # Negative matching will skip quantization init for this module entirely + # (non-quantized inference). More details and quantize examples can be + # found at: https://github.com/ModelCloud/GPTQModel # Example: # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 # # last 1/4 of the layers 16-21 has 8bit and group_size 64 # dynamic_cfg = { # #`.*\.` matches the layers_node prefix - # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, # positive match layer 10-15 - # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # positive match layer 16-21 + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # r"-:.*\.moe\..*": {}, # negative match all `moe` layers - #} + # } self.dynamic_cfg = dynamic_cfg - self.weight_bits = weight_bits self.is_sym = is_sym @@ -91,7 +94,8 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # match dynamic rules with module name (prefix) and apply quantize config overrides if matched + # match dynamic rules with module name (prefix) and apply to quantize + # config overrides if matched def override_config(self, prefix: str): bits = self.weight_bits @@ -142,6 +146,8 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic_cfg = cls.get_from_keys_or(config, ["dynamic"], default={}) + if dynamic_cfg is None: + dynamic_cfg = {} weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -196,12 +202,14 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: - lm_head_quantized = isinstance(layer, ParallelLMHead) and self.lm_head_quantized + 
lm_head_quantized = isinstance( + layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic_cfg) > 0: result = self.get_dynamic_config(layer_name=prefix) if result is not None and not result: - return UnquantizedEmbeddingMethod() if lm_head_quantized else UnquantizedLinearMethod() + return UnquantizedEmbeddingMethod( + ) if lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 7b774bbbd7044f7e2e2146b8764816f51394309c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 04:59:13 +0000 Subject: [PATCH 35/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_lm_head.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 246eff2588b9..812d3d6825a6 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -38,8 +38,6 @@ def test_lm_head( def check_model(model): lm_head_layer = model.lm_head - print("lm_head_layer.quant_method", model, - lm_head_layer.quant_method) if lm_head_quantized: assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, From c72125a3f8f4652f7f8792eee3a7eeb825912cd9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 06:54:08 +0000 Subject: [PATCH 36/56] cleanup Signed-off-by: ZX-ModelCloud --- ...tq_dynamic_cfg.py => test_gptq_dynamic.py} | 16 ++++---- .../layers/quantization/gptq_marlin.py | 40 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) rename tests/quantization/{test_gptq_dynamic_cfg.py => test_gptq_dynamic.py} (78%) diff --git a/tests/quantization/test_gptq_dynamic_cfg.py b/tests/quantization/test_gptq_dynamic.py similarity index 78% rename from tests/quantization/test_gptq_dynamic_cfg.py rename to tests/quantization/test_gptq_dynamic.py index 419eaae1c6cd..6c177e1be0f4 100644 --- a/tests/quantization/test_gptq_dynamic_cfg.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -"""Tests whether gptq models with dynamic_cfg quantized can be loaded. +"""Tests whether gptq models with dynamic quantized can be loaded. -Run `pytest tests/quantization/test_gptq_dynamic_cfg.py --forked`. +Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. """ import pytest @@ -22,7 +22,7 @@ @pytest.mark.parametrize("model_id", MODEL_QUANT) -def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): +def test_gptq_with_dynamic(vllm_runner, model_id: str): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) for name, submodule in (vllm_model.model.llm_engine.model_executor. 
@@ -42,11 +42,11 @@ def test_gptq_with_dynamic_cfg(vllm_runner, model_id: str): # desc_act=False assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) config = submodule.quant_method.quant_config - assert config.get_dynamic_config(layer_name=name, key="bits") == 8 - assert config.get_dynamic_config(layer_name=name, - key="group_size") == 32 - assert not config.get_dynamic_config(layer_name=name, - key="desc_act") + assert config.get_dynamic_value(layer_name=name, key="bits") == 8 + assert config.get_dynamic_value(layer_name=name, + key="group_size") == 32 + assert not config.get_dynamic_value(layer_name=name, + key="desc_act") elif (name == 'model.layers.2.self_attn.qkv_proj' or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 746371814236..2b24bdbae2f7 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -51,14 +51,14 @@ def __init__( desc_act: bool, is_sym: bool, lm_head_quantized: bool, - dynamic_cfg: Dict[str, Dict[str, Union[int, bool]]], + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic_cfg` to allow per module quantization config + # GPTQModel use `dynamic` to allow per module quantization config # so each module can be optimized for its own unique quant errors. # Format is Dict[str, Dict] where key is a regex string that can both # positive ("+:" prefixed) or negative ("-:" prefixed) match a module. 
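(A side note on the rule format described in the comment above: the matching behaviour can be reproduced in isolation with a few lines of Python. The rule set and layer names below are made up for illustration and are not part of this patch; the snippet only mirrors the lookup this config class performs.)

import re
from typing import Dict, Optional, Union

# Made-up rule set, in the same format as the `dynamic` dict documented above.
dynamic = {
    r"+:.*\.(?:1[0-5])\..*": {"bits": 8},                          # layers 10-15 -> 8-bit
    r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64},  # layers 16-21 -> 8-bit, group 64
    r"-:.*\.moe\..*": {},                                          # skip all moe modules
}

def resolve(layer_name: str, key: Optional[str] = None,
            default: Union[int, bool, None] = None) -> Union[Dict, int, bool, None]:
    for pattern, overrides in dynamic.items():
        if pattern.startswith("-:"):
            # negative match: exclude this module from quantized init
            if re.match(pattern.removeprefix("-:"), layer_name):
                return False
        elif re.match(pattern.removeprefix("+:"), layer_name):
            # positive match: return one overridden field (or the whole override dict)
            return overrides if key is None else overrides.get(key, default)
    return default  # no rule matched: keep the base quantize config value

print(resolve("model.layers.12.self_attn.qkv_proj", "bits", 4))  # 8
print(resolve("model.layers.3.mlp.gate_up_proj", "bits", 4))     # 4 (base config)
print(resolve("model.layers.7.moe.experts.0.w1"))                # False (module skipped)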
@@ -70,7 +70,7 @@ def __init__( # Example: # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 # # last 1/4 of the layers 16-21 has 8bit and group_size 64 - # dynamic_cfg = { + # dynamic = { # #`.*\.` matches the layers_node prefix # # positive match layer 10-15 # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, @@ -78,7 +78,7 @@ def __init__( # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, # r"-:.*\.moe\..*": {}, # negative match all `moe` layers # } - self.dynamic_cfg = dynamic_cfg + self.dynamic = dynamic self.weight_bits = weight_bits self.is_sym = is_sym @@ -99,17 +99,17 @@ def __init__( def override_config(self, prefix: str): bits = self.weight_bits - b = self.get_dynamic_config(prefix, "bits", bits) + b = self.get_dynamic_value(prefix, "bits", bits) if isinstance(b, int): bits = b - group_size = self.get_dynamic_config(prefix, "group_size", - self.group_size) + group_size = self.get_dynamic_value(prefix, "group_size", + self.group_size) if isinstance(group_size, int): self.group_size = group_size - desc_act = self.get_dynamic_config(prefix, "desc_act", self.desc_act) + desc_act = self.get_dynamic_value(prefix, "desc_act", self.desc_act) if isinstance(desc_act, bool): self.desc_act = desc_act - is_sym = self.get_dynamic_config(prefix, "sym", self.is_sym) + is_sym = self.get_dynamic_value(prefix, "sym", self.is_sym) if isinstance(is_sym, bool): self.is_sym = is_sym @@ -125,7 +125,7 @@ def __repr__(self) -> str: f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " f"lm_head_quantized={self.lm_head_quantized}), " - f"dynamic_cfg={self.dynamic_cfg}") + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -145,9 +145,9 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": - dynamic_cfg = cls.get_from_keys_or(config, ["dynamic"], default={}) - if dynamic_cfg is None: - dynamic_cfg = {} + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + if dynamic is None: + dynamic = {} weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -155,7 +155,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized, dynamic_cfg) + lm_head_quantized, dynamic) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -178,13 +178,13 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_dynamic_config( + def get_dynamic_value( self, layer_name: str, key: Optional[str] = None, default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: - for pattern, pattern_dict in self.dynamic_cfg.items(): + for pattern, pattern_dict in self.dynamic.items(): # negative match: matched modules are excluded from quantization if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): @@ -205,8 +205,8 @@ def get_quant_method( lm_head_quantized = isinstance( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or lm_head_quantized: - if len(self.dynamic_cfg) > 0: - result = self.get_dynamic_config(layer_name=prefix) + if len(self.dynamic) > 0: + result = self.get_dynamic_value(layer_name=prefix) if result is not None and not result: return UnquantizedEmbeddingMethod( ) if lm_head_quantized else 
UnquantizedLinearMethod() @@ -256,8 +256,8 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.quant_config = deepcopy(quant_config) self.prefix = prefix - if len(self.quant_config.dynamic_cfg) > 0 and self.prefix: - # gptqmodel per module/layer dynamic_cfg my override/change base + if len(self.quant_config.dynamic) > 0 and self.prefix: + # gptqmodel per module/layer dynamic my override/change base # model quant config self.quant_config.override_config(prefix=self.prefix) From c298195414c0f9307557c73120b1209871aac251 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 07:18:59 +0000 Subject: [PATCH 37/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 11 ++++++----- .../layers/quantization/gptq_marlin.py | 14 +++++++------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 6c177e1be0f4..2262a51a5b33 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -42,11 +42,12 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str): # desc_act=False assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) config = submodule.quant_method.quant_config - assert config.get_dynamic_value(layer_name=name, key="bits") == 8 - assert config.get_dynamic_value(layer_name=name, - key="group_size") == 32 - assert not config.get_dynamic_value(layer_name=name, - key="desc_act") + assert config.get_dynamic_override(layer_name=name, + key="bits") == 8 + assert config.get_dynamic_override(layer_name=name, + key="group_size") == 32 + assert not config.get_dynamic_override(layer_name=name, + key="desc_act") elif (name == 'model.layers.2.self_attn.qkv_proj' or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 2b24bdbae2f7..af707d0869ca 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -99,17 +99,17 @@ def __init__( def override_config(self, prefix: str): bits = self.weight_bits - b = self.get_dynamic_value(prefix, "bits", bits) + b = self.get_dynamic_override(prefix, "bits", bits) if isinstance(b, int): bits = b - group_size = self.get_dynamic_value(prefix, "group_size", - self.group_size) + group_size = self.get_dynamic_override(prefix, "group_size", + self.group_size) if isinstance(group_size, int): self.group_size = group_size - desc_act = self.get_dynamic_value(prefix, "desc_act", self.desc_act) + desc_act = self.get_dynamic_override(prefix, "desc_act", self.desc_act) if isinstance(desc_act, bool): self.desc_act = desc_act - is_sym = self.get_dynamic_value(prefix, "sym", self.is_sym) + is_sym = self.get_dynamic_override(prefix, "sym", self.is_sym) if isinstance(is_sym, bool): self.is_sym = is_sym @@ -178,7 +178,7 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_dynamic_value( + def get_dynamic_override( self, layer_name: str, key: Optional[str] = None, @@ -206,7 +206,7 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic) > 0: - result = self.get_dynamic_value(layer_name=prefix) + result = self.get_dynamic_override(layer_name=prefix) if result is not None and not result: return 
UnquantizedEmbeddingMethod( ) if lm_head_quantized else UnquantizedLinearMethod() From bbc049dbce6160ccdff1e3ebe3f90ad3f415e3a9 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 15:38:42 +0800 Subject: [PATCH 38/56] Update gptq_marlin.py --- .../layers/quantization/gptq_marlin.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index af707d0869ca..93e46fd63e5a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -58,14 +58,14 @@ def __init__( # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic` to allow per module quantization config - # so each module can be optimized for its own unique quant errors. - # Format is Dict[str, Dict] where key is a regex string that can both - # positive ("+:" prefixed) or negative ("-:" prefixed) match a module. - # Default to positive match (override base quant config mode) if no - # prefix. Value is in dict format of field key and override value. - # Negative matching will skip quantization init for this module entirely - # (non-quantized inference). More details and quantize examples can be + # GPTQModel use `dynamic` config property to allow per module quantization + # config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can perform both + # positive ("+:" prefixed) or negative ("-:" prefixed) matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override value. + # Negative matching will skip quantization init for this module entirely: + # non-quantized inference. More details and quantization examples can be # found at: https://github.com/ModelCloud/GPTQModel # Example: # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 @@ -76,7 +76,7 @@ def __init__( # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, # # positive match layer 16-21 # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, - # r"-:.*\.moe\..*": {}, # negative match all `moe` layers + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers # } self.dynamic = dynamic @@ -94,8 +94,8 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # match dynamic rules with module name (prefix) and apply to quantize - # config overrides if matched + # Match dynamic rules with module name (prefix) and override quantize + # config if module (prefix) matches a rule def override_config(self, prefix: str): bits = self.weight_bits @@ -185,11 +185,11 @@ def get_dynamic_override( default_value: Union[int, bool, None] = None ) -> Union[Dict, int, bool, None]: for pattern, pattern_dict in self.dynamic.items(): - # negative match: matched modules are excluded from quantization + # Negative match: matched modules are excluded from quantized init if pattern.startswith("-:"): if re.match(pattern.removeprefix("-:"), layer_name): return False - # positive match: matched modules have quant properties overriding + # Positive match: matched modules have quant properties overrides # base quant config elif re.match(pattern.removeprefix("+:"), layer_name): if key is None: @@ -218,7 +218,6 @@ def get_quant_method( @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): - # Extract data from quant config. 
quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") group_size = quant_config.get("group_size") @@ -231,7 +230,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): if quant_method != "gptq": return False - # If we cannot find the info needed in the config, cannot convert. + # Marlin conversion is only valid if required properties are found if (num_bits is None or group_size is None or sym is None or desc_act is None): return False @@ -257,8 +256,7 @@ def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: self.prefix = prefix if len(self.quant_config.dynamic) > 0 and self.prefix: - # gptqmodel per module/layer dynamic my override/change base - # model quant config + # Dynamic per module/layer rules may override base config self.quant_config.override_config(prefix=self.prefix) # Verify supported on platform. From 78f88183345dcc05e5078def6250de7018642f22 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 07:41:43 +0000 Subject: [PATCH 39/56] format Signed-off-by: ZX-ModelCloud --- .../layers/quantization/gptq_marlin.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 93e46fd63e5a..61c8d9011f0d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -58,13 +58,16 @@ def __init__( # (since we have only one group per output channel) desc_act = False - # GPTQModel use `dynamic` config property to allow per module quantization - # config so each module can be individually optimized. - # Format is Dict[str, Dict] where key is a regex string that can perform both - # positive ("+:" prefixed) or negative ("-:" prefixed) matching of a module. + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. # Default to positive match, override base quant config mode, if no - # prefix is used. Value is in dict format of field key and override value. - # Negative matching will skip quantization init for this module entirely: + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: # non-quantized inference. 
More details and quantization examples can be # found at: https://github.com/ModelCloud/GPTQModel # Example: From 93ee5762186ee1f3e92fd250e275d168a87ca4ff Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:23:07 +0800 Subject: [PATCH 40/56] Update gptq_marlin.py --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 3a616d6f307f..36d4635db956 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -210,7 +210,8 @@ def get_quant_method( if isinstance(layer, LinearBase) or lm_head_quantized: if len(self.dynamic) > 0: result = self.get_dynamic_override(layer_name=prefix) - if result is not None and not result: + # False = skip module, None = no override, else = Positive match + if result == False: return UnquantizedEmbeddingMethod( ) if lm_head_quantized else UnquantizedLinearMethod() From 6ebf85cbfc56b2f8fe5110b4de00818f1a71f47b Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:36:04 +0800 Subject: [PATCH 41/56] rename to parallel_lm_head_quantized for clarity --- vllm/model_executor/layers/quantization/gptq_marlin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 36d4635db956..1b5003c1aed8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -205,15 +205,15 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: - lm_head_quantized = isinstance( + parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and self.lm_head_quantized - if isinstance(layer, LinearBase) or lm_head_quantized: + if isinstance(layer, LinearBase) or parallel_lm_head_quantized: if len(self.dynamic) > 0: result = self.get_dynamic_override(layer_name=prefix) # False = skip module, None = no override, else = Positive match if result == False: return UnquantizedEmbeddingMethod( - ) if lm_head_quantized else UnquantizedLinearMethod() + ) if parallel_lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 59bdf54c2ee2c314fb28049fcccc7877fc602c0f Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:38:31 +0800 Subject: [PATCH 42/56] simplify --- vllm/model_executor/layers/quantization/gptq_marlin.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 1b5003c1aed8..11e65e6ced2b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,12 +208,10 @@ def get_quant_method( parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: - if len(self.dynamic) > 0: - result = self.get_dynamic_override(layer_name=prefix) - # False = skip module, None = no override, else = Positive match - if result == False: - return UnquantizedEmbeddingMethod( - ) if 
parallel_lm_head_quantized else UnquantizedLinearMethod() + # False = skip module, None = no override, else = Positive match + if self.get_dynamic_override(layer_name=prefix) == False: + return UnquantizedEmbeddingMethod( + ) if parallel_lm_head_quantized else UnquantizedLinearMethod() return GPTQMarlinLinearMethod(self, prefix=prefix) elif isinstance(layer, FusedMoE): From 9de0382159e6e18a59a2b243f71b08a9224b4870 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Fri, 7 Feb 2025 23:42:48 +0800 Subject: [PATCH 43/56] shorten code --- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 11e65e6ced2b..22bd9c0a63b8 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -149,8 +149,7 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) - if dynamic is None: - dynamic = {} + dynamic = {} if dynamic is None else dynamic weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) From 67d0882ff4522a2e3100d7a76ec052e095a97adb Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 16:09:31 +0000 Subject: [PATCH 44/56] cleanup Signed-off-by: ZX-ModelCloud --- .../layers/quantization/gptq_marlin.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 22bd9c0a63b8..d3a7253c7686 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,11 +208,17 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override(layer_name=prefix) == False: + if self.get_dynamic_override(layer_name=prefix) == False: return UnquantizedEmbeddingMethod( ) if parallel_lm_head_quantized else UnquantizedLinearMethod() - return GPTQMarlinLinearMethod(self, prefix=prefix) + quant_config = deepcopy(self) + + if len(quant_config.dynamic) > 0 and prefix: + # Dynamic per module/layer rules may override base config + quant_config.override_config(prefix=prefix) + + return GPTQMarlinLinearMethod(quant_config) elif isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) return None @@ -252,13 +258,8 @@ class GPTQMarlinLinearMethod(LinearMethodBase): _kernel_backends_being_used: Set[str] = set() - def __init__(self, quant_config: GPTQMarlinConfig, prefix: str) -> None: - self.quant_config = deepcopy(quant_config) - self.prefix = prefix - - if len(self.quant_config.dynamic) > 0 and self.prefix: - # Dynamic per module/layer rules may override base config - self.quant_config.override_config(prefix=self.prefix) + def __init__(self, quant_config: GPTQMarlinConfig) -> None: + self.quant_config = quant_config # Verify supported on platform. 
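The explicit comparison against False in these hunks is deliberate: the dynamic lookup can return False, None, or a (possibly empty) override dict, and only the first of those means "skip quantization". A tiny self-contained illustration of that tri-state dispatch; the return strings below are placeholders, not vLLM classes:

from typing import Dict, Optional, Union

def choose_method(override: Union[Dict, bool, None]) -> Optional[str]:
    # Tri-state contract from the patch:
    #   False              -> negative match: leave this module unquantized
    #   None               -> no rule matched: fall back to the base config
    #   dict (maybe empty) -> positive match: quantize, applying the overrides
    if override is False:
        return None                   # stand-in for UnquantizedLinearMethod
    return "quantized"                # stand-in for GPTQMarlinLinearMethod

assert choose_method(False) is None        # skipped module
assert choose_method(None) == "quantized"  # base config applies unchanged
assert choose_method({}) == "quantized"    # positive match with no field overrides
# A plain truthiness test (`if not override`) would wrongly treat None and {}
# the same as False, which is why the patch compares against False explicitly.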
verify_marlin_supported(quant_type=self.quant_config.quant_type, From 5623936795422e82254b01e1f799eec4cc7a9300 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Fri, 7 Feb 2025 16:13:22 +0000 Subject: [PATCH 45/56] cleanup Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d3a7253c7686..6f1be14c9be6 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -214,7 +214,7 @@ def get_quant_method( quant_config = deepcopy(self) - if len(quant_config.dynamic) > 0 and prefix: + if prefix: # Dynamic per module/layer rules may override base config quant_config.override_config(prefix=prefix) From e41bdd760b9aa5948c20fa69c5bbd9a2a03c69ae Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sat, 8 Feb 2025 00:14:54 +0800 Subject: [PATCH 46/56] make lint pass --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 6f1be14c9be6..d910587eb3b5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,7 +208,7 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override(layer_name=prefix) == False: + if self.get_dynamic_override(layer_name=prefix) == False: # noqa: E712 return UnquantizedEmbeddingMethod( ) if parallel_lm_head_quantized else UnquantizedLinearMethod() From 965d7daa1ba320b3389f57eea10ab7d68e82eec9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 02:26:01 +0000 Subject: [PATCH 47/56] change model_id Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 2 +- tests/quantization/test_lm_head.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 2262a51a5b33..37db57dc8d98 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -17,7 +17,7 @@ # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head" + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head" ] diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 812d3d6825a6..7343e56099ad 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -18,7 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ - ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bits-dynamic-cfg-with-lm_head", + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), From 1a34027aedb408329f85a3d3f6c3d7b375c179de Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:07:13 +0000 Subject: [PATCH 48/56] format Signed-off-by: ZX-ModelCloud --- 
tests/quantization/test_lm_head.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 7343e56099ad..3bbf1f0e4765 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -18,8 +18,7 @@ PROMPT = "On the surface of Mars, we found" MODELS_QUANT = [ - ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", - True), + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) From 0b249a1f9e37c07e3b4ac9cc84a0a1383abb3dd3 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:19:53 +0000 Subject: [PATCH 49/56] format code Signed-off-by: ZX-ModelCloud --- .../layers/vocab_parallel_embedding.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 85081a26c149..f65dfc3cb329 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -261,12 +261,12 @@ def __init__(self, self.shard_indices.added_vocab_start_index) self.quant_method.create_weights(self, - self.embedding_dim, - [self.num_embeddings_per_partition], - self.embedding_dim, - self.num_embeddings_padded, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader) @classmethod def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, @@ -413,7 +413,7 @@ def forward(self, input_): masked_input = input_ # Get the embeddings. output_parallel = self.quant_method.embedding(self, - masked_input.long()) + masked_input.long()) # Mask the output embedding. if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) From 4de04ae40bd9473d7039d365f8f55533cdaeff10 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:39:53 +0000 Subject: [PATCH 50/56] format code Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/logits_processor.py | 4 ++-- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 7c907242de3e..9b1742998578 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -109,8 +109,8 @@ def _get_logits( ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. 
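The call sites being re-indented in these commits illustrate the two-phase contract every quant method follows: parameters are created once when the layer is built, then applied during the forward pass. A toy stand-in for that contract using plain torch; the class below is illustrative, not a real vLLM quant method:

from typing import Optional
import torch

class ToyUnquantizedMethod:
    """Minimal stand-in for the create_weights/apply contract."""

    def create_weights(self, layer: torch.nn.Module, in_features: int,
                       out_features: int, params_dtype: torch.dtype) -> None:
        weight = torch.nn.Parameter(
            torch.empty(out_features, in_features, dtype=params_dtype),
            requires_grad=False)
        layer.register_parameter("weight", weight)

    def apply(self, layer: torch.nn.Module, x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        return torch.nn.functional.linear(x, layer.weight, bias)

lm_head = torch.nn.Module()
method = ToyUnquantizedMethod()
method.create_weights(lm_head, in_features=16, out_features=32,
                      params_dtype=torch.float32)
torch.nn.init.normal_(lm_head.weight)
hidden_states = torch.randn(4, 16)
print(method.apply(lm_head, hidden_states).shape)  # torch.Size([4, 32])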
logits = lm_head.quant_method.apply(lm_head, - hidden_states, - bias=embedding_bias) + hidden_states, + bias=embedding_bias) # Gather logits for TP logits = self._gather_logits(logits) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d910587eb3b5..649d09c62877 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,7 +208,8 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override(layer_name=prefix) == False: # noqa: E712 + if self.get_dynamic_override( + layer_name=prefix) == False: # noqa: E712 return UnquantizedEmbeddingMethod( ) if parallel_lm_head_quantized else UnquantizedLinearMethod() From 4c0608b3d9ec61df6d1e8f5df97d58ec08cbc75f Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:42:33 +0000 Subject: [PATCH 51/56] format code Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/gptq_marlin.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 649d09c62877..76b5e81cfa6b 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,10 +208,11 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override( - layer_name=prefix) == False: # noqa: E712 - return UnquantizedEmbeddingMethod( - ) if parallel_lm_head_quantized else UnquantizedLinearMethod() + if self.get_dynamic_override( + layer_name=prefix) == False: # noqa: E712 + if parallel_lm_head_quantized: + return UnquantizedEmbeddingMethod() + return UnquantizedLinearMethod() quant_config = deepcopy(self) From 8f2137547ef1cb4abf1524e21b05000670ec9e6c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 03:47:50 +0000 Subject: [PATCH 52/56] disable E712 ruff check Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 76b5e81cfa6b..e0060f74a9ac 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -208,7 +208,7 @@ def get_quant_method( layer, ParallelLMHead) and self.lm_head_quantized if isinstance(layer, LinearBase) or parallel_lm_head_quantized: # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override( + if self.get_dynamic_override( # noqa: E712 layer_name=prefix) == False: # noqa: E712 if parallel_lm_head_quantized: return UnquantizedEmbeddingMethod() From e3084e3586086b069661ad431296540bf9c67204 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 08:28:04 +0000 Subject: [PATCH 53/56] Extract code to gptq_utils.get_linear_quant_method() Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 30 +++--- .../layers/quantization/gptq.py | 47 +++++++-- .../layers/quantization/gptq_marlin.py | 77 
++------------- .../layers/quantization/utils/gptq_utils.py | 98 +++++++++++++++++++ 4 files changed, 163 insertions(+), 89 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/gptq_utils.py diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 37db57dc8d98..88882a5bdbdd 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -8,8 +8,11 @@ import torch from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_dynamic_override) PROMPT = "On the surface of Mars, we found" @@ -17,7 +20,8 @@ # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head" + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", ] @@ -25,14 +29,18 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + use_marlin_kernel = "symTrue" in model_id + linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( + GPTQLinearMethod) + for name, submodule in (vllm_model.model.llm_engine.model_executor. driver_worker.model_runner.model.named_modules()): if name == "lm_head": - assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + assert isinstance(submodule.quant_method, linear_method_cls) elif name == 'model.layers.0.self_attn.qkv_proj': # The first layer is quantized using bits=4, group_size=128 # desc_act=True - assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + assert isinstance(submodule.quant_method, linear_method_cls) config = submodule.quant_method.quant_config assert config.weight_bits == 4 assert config.group_size == 128 @@ -40,18 +48,18 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str): elif name == 'model.layers.1.self_attn.qkv_proj': # The second layer is quantized using bits=8, group_size=32 # desc_act=False - assert isinstance(submodule.quant_method, GPTQMarlinLinearMethod) + assert isinstance(submodule.quant_method, linear_method_cls) config = submodule.quant_method.quant_config - assert config.get_dynamic_override(layer_name=name, - key="bits") == 8 - assert config.get_dynamic_override(layer_name=name, - key="group_size") == 32 - assert not config.get_dynamic_override(layer_name=name, - key="desc_act") + assert get_dynamic_override(config, layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") elif (name == 'model.layers.2.self_attn.qkv_proj' or name == 'model.layers.2.mlp.gate_up_proj'): # All other layers (layer index >= 2) are not quantized assert isinstance(submodule.quant_method, UnquantizedLinearMethod) - print(vllm_model.generate_greedy(prompts=[PROMPT], max_tokens=10)[0][1]) del vllm_model diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 0cb77a7546d1..6d1f0cc2eb4d 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ 
b/vllm/model_executor/layers/quantization/gptq.py @@ -3,16 +3,17 @@ import enum from enum import Enum from fractions import Fraction -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -32,7 +33,33 @@ def __init__( group_size: int, desc_act: bool, lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + self.dynamic = dynamic + self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act @@ -47,7 +74,8 @@ def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act})," - f"lm_head_quantized={self.lm_head_quantized}") + f"lm_head_quantized={self.lm_head_quantized}), " + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -68,19 +96,20 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic = {} if dynamic is None else dynamic + weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - return cls(weight_bits, group_size, desc_act, lm_head_quantized) + return cls(weight_bits, group_size, desc_act, lm_head_quantized, + dynamic) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["GPTQLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return GPTQLinearMethod(self) - return None + return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) class ExllamaState(Enum): diff --git 
a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index e0060f74a9ac..0a9d86b008db 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import re -from copy import deepcopy from typing import Any, Callable, Dict, List, Optional, Set, Union import torch @@ -11,7 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, +from vllm.model_executor.layers.linear import (LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( @@ -19,11 +17,13 @@ from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( MPLinearLayerConfig, choose_mp_linear_kernel) from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, UnquantizedEmbeddingMethod) + UnquantizedEmbeddingMethod) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -97,32 +97,6 @@ def __init__( self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # Match dynamic rules with module name (prefix) and override quantize - # config if module (prefix) matches a rule - def override_config(self, prefix: str): - bits = self.weight_bits - - b = self.get_dynamic_override(prefix, "bits", bits) - if isinstance(b, int): - bits = b - group_size = self.get_dynamic_override(prefix, "group_size", - self.group_size) - if isinstance(group_size, int): - self.group_size = group_size - desc_act = self.get_dynamic_override(prefix, "desc_act", self.desc_act) - if isinstance(desc_act, bool): - self.desc_act = desc_act - is_sym = self.get_dynamic_override(prefix, "sym", self.is_sym) - if isinstance(is_sym, bool): - self.is_sym = is_sym - - self.pack_factor = 32 // bits # packed into int32 - if (bits, self.is_sym) not in self.TYPE_MAP: - raise ValueError("Unsupported quantization config: " - f"bits={bits}, sym={self.is_sym}") - - self.quant_type = self.TYPE_MAP[(bits, self.is_sym)] - def __repr__(self) -> str: return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " f"group_size={self.group_size}, " @@ -150,6 +124,7 @@ def get_config_filenames(cls) -> List[str]: def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) dynamic = {} if dynamic is None else dynamic + weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -180,50 +155,14 @@ def override_quantization_method(cls, hf_quant_cfg, " faster inference") return None - def get_dynamic_override( - self, - layer_name: str, - key: Optional[str] = None, - default_value: Union[int, bool, None] = None - ) -> Union[Dict, int, bool, None]: - for pattern, pattern_dict in self.dynamic.items(): - # Negative match: matched modules are excluded from 
quantized init - if pattern.startswith("-:"): - if re.match(pattern.removeprefix("-:"), layer_name): - return False - # Positive match: matched modules have quant properties overrides - # base quant config - elif re.match(pattern.removeprefix("+:"), layer_name): - if key is None: - return pattern_dict - else: - return pattern_dict.get(key, default_value) - return default_value - def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: - parallel_lm_head_quantized = isinstance( - layer, ParallelLMHead) and self.lm_head_quantized - if isinstance(layer, LinearBase) or parallel_lm_head_quantized: - # False = skip module, None = no override, else = Positive match - if self.get_dynamic_override( # noqa: E712 - layer_name=prefix) == False: # noqa: E712 - if parallel_lm_head_quantized: - return UnquantizedEmbeddingMethod() - return UnquantizedLinearMethod() - - quant_config = deepcopy(self) - - if prefix: - # Dynamic per module/layer rules may override base config - quant_config.override_config(prefix=prefix) - - return GPTQMarlinLinearMethod(quant_config) - elif isinstance(layer, FusedMoE): + if isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) - return None + return get_linear_quant_method(self, layer, prefix, + GPTQMarlinLinearMethod) @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py new file mode 100644 index 000000000000..7552f9d1705a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +import re +from copy import deepcopy +from fractions import Fraction +from typing import Dict, Optional, Union + +import torch + +from vllm.config import QuantizationConfig +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, UnquantizedEmbeddingMethod) + + +# Match dynamic rules with module name (prefix) and override quantize +# config if module (prefix) matches a rule +def override_config(config: QuantizationConfig, prefix: str): + weight_bits = get_dynamic_override(config, prefix, "bits", + config.weight_bits) + if isinstance(weight_bits, int): + config.weight_bits = weight_bits + group_size = get_dynamic_override(config, prefix, "group_size", + config.group_size) + if isinstance(group_size, int): + config.group_size = group_size + desc_act = get_dynamic_override(config, prefix, "desc_act", + config.desc_act) + if isinstance(desc_act, bool): + config.desc_act = desc_act + + if config.get_name() == "gptq_marlin": + is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym) + if isinstance(is_sym, bool): + config.is_sym = is_sym + + config.pack_factor = 32 // config.weight_bits # packed into int32 + if (config.weight_bits, config.is_sym) not in config.TYPE_MAP: + raise ValueError("Unsupported quantization config: " + f"bits={config.weight_bits}, sym={config.is_sym}") + + config.quant_type = config.TYPE_MAP[(config.weight_bits, + config.is_sym)] + elif config.get_name() == "gptq": + config.pack_factor = Fraction(32, config.weight_bits) + if config.weight_bits not in [2, 3, 4, 8]: + raise ValueError( + "Currently, 
only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {config.weight_bits} bits.") + + +def get_dynamic_override( + config: QuantizationConfig, + layer_name: str, + key: Optional[str] = None, + default_value: Union[int, bool, + None] = None) -> Union[Dict, int, bool, None]: + for pattern, pattern_dict in config.dynamic.items(): + # Negative match: matched modules are excluded from quantized init + if pattern.startswith("-:"): + if re.match(pattern.removeprefix("-:"), layer_name): + return False + # Positive match: matched modules have quant properties overrides + # base quant config + elif re.match(pattern.removeprefix("+:"), layer_name): + if key is None: + return pattern_dict + else: + return pattern_dict.get(key, default_value) + return default_value + + +def get_linear_quant_method( + config: QuantizationConfig, + layer: torch.nn.Module, + prefix: str, + linear_method_cls: type, +) -> Optional[QuantizeMethodBase]: + cloned_config = deepcopy(config) + parallel_lm_head_quantized = isinstance( + layer, ParallelLMHead) and cloned_config.lm_head_quantized + if isinstance(layer, LinearBase) or parallel_lm_head_quantized: + # False = skip module, None = no override, else = Positive match + if get_dynamic_override( # noqa: E712 + cloned_config, # noqa: E712 + layer_name=prefix) == False: # noqa: E712 + if parallel_lm_head_quantized: + return UnquantizedEmbeddingMethod() + return UnquantizedLinearMethod() + + if prefix: + # Dynamic per module/layer rules may override base config + override_config(cloned_config, prefix=prefix) + + return linear_method_cls(cloned_config) + return None From 25dbd5a5aaeebc7162ec61b8a6a8f95e5911e69e Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 08:46:46 +0000 Subject: [PATCH 54/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 15 ++++++++++----- .../layers/quantization/utils/gptq_utils.py | 13 +++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 88882a5bdbdd..71ad0835c8a3 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -3,6 +3,7 @@ Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. 
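The helper extracted into gptq_utils above deep-copies the config before applying any per-prefix override, so a rule that changes one layer never bleeds into sibling layers or the shared base config. A condensed stand-in of that flow; the dataclass below is illustrative, not the real GPTQConfig/GPTQMarlinConfig, and negative "-:" rules are omitted for brevity:

import re
from copy import deepcopy
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class ToyQuantConfig:
    weight_bits: int = 4
    group_size: int = 128
    # regex -> field overrides, in the spirit of the patch's `dynamic` property
    dynamic: Dict[str, Dict[str, int]] = field(default_factory=dict)

def config_for_layer(base: ToyQuantConfig, prefix: str) -> ToyQuantConfig:
    cfg = deepcopy(base)              # each layer works on its own copy
    for pattern, overrides in base.dynamic.items():
        if re.match(pattern.removeprefix("+:"), prefix):
            for key, value in overrides.items():
                setattr(cfg, key, value)
            break
    return cfg                        # the real code hands this to linear_method_cls

base = ToyQuantConfig(
    dynamic={r"+:.*\.layers\.1\..*": {"weight_bits": 8, "group_size": 32}})
a = config_for_layer(base, "model.layers.0.self_attn.qkv_proj")
b = config_for_layer(base, "model.layers.1.self_attn.qkv_proj")
print(a.weight_bits, a.group_size)  # 4 128 (no rule matched; copy of the base)
print(b.weight_bits, b.group_size)  # 8 32  (override applied to this layer only)
print(base.weight_bits)             # 4     (shared base config never mutated)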
""" +from typing import Tuple import pytest import torch @@ -20,16 +21,20 @@ # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", - "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + True), + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + False), ] -@pytest.mark.parametrize("model_id", MODEL_QUANT) -def test_gptq_with_dynamic(vllm_runner, model_id: str): +@pytest.mark.parametrize("model_id_and_use_marlin_kernel", MODEL_QUANT) +def test_gptq_with_dynamic(vllm_runner, + model_id_and_use_marlin_kernel: Tuple[str, bool]): + model_id, use_marlin_kernel = model_id_and_use_marlin_kernel + vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) - use_marlin_kernel = "symTrue" in model_id linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 7552f9d1705a..1e85316f0110 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -2,18 +2,21 @@ import re from copy import deepcopy from fractions import Fraction -from typing import Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, Optional, Union import torch from vllm.config import QuantizationConfig from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizeMethodBase) from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, UnquantizedEmbeddingMethod) +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod + from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod, GPTQMarlinMoEMethod) + # Match dynamic rules with module name (prefix) and override quantize # config if module (prefix) matches a rule @@ -77,7 +80,9 @@ def get_linear_quant_method( layer: torch.nn.Module, prefix: str, linear_method_cls: type, -) -> Optional[QuantizeMethodBase]: +) -> Optional[Union["GPTQLinearMethod", "GPTQMarlinLinearMethod", + "GPTQMarlinMoEMethod", UnquantizedLinearMethod, + UnquantizedEmbeddingMethod]]: cloned_config = deepcopy(config) parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and cloned_config.lm_head_quantized From 874076c52423a87dd71f6252811df0e129324636 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Tue, 11 Feb 2025 09:07:01 +0000 Subject: [PATCH 55/56] cleanup Signed-off-by: ZX-ModelCloud --- tests/quantization/test_gptq_dynamic.py | 8 +++----- tests/quantization/test_lm_head.py | 10 ++++------ .../layers/quantization/utils/gptq_utils.py | 11 ++--------- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 71ad0835c8a3..c6f34fef2743 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -3,7 +3,6 @@ Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. 
""" -from typing import Tuple import pytest import torch @@ -28,10 +27,9 @@ ] -@pytest.mark.parametrize("model_id_and_use_marlin_kernel", MODEL_QUANT) -def test_gptq_with_dynamic(vllm_runner, - model_id_and_use_marlin_kernel: Tuple[str, bool]): - model_id, use_marlin_kernel = model_id_and_use_marlin_kernel +@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) +def test_gptq_with_dynamic(vllm_runner, model_id: str, + use_marlin_kernel: bool): vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 3bbf1f0e4765..20435a287e37 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -3,7 +3,6 @@ Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. """ -from typing import Tuple import pytest import torch @@ -25,14 +24,13 @@ ] -@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) +@pytest.mark.parametrize("model_id, lm_head_quantized", MODELS_QUANT) def test_lm_head( vllm_runner, - model_lm_head_quant: Tuple[str, bool], + model_id: str, + lm_head_quantized: bool, ) -> None: - model, lm_head_quantized = model_lm_head_quant - - with vllm_runner(model, dtype=torch.float16, + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: def check_model(model): diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 1e85316f0110..fe2a7c959f6c 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -2,7 +2,7 @@ import re from copy import deepcopy from fractions import Fraction -from typing import TYPE_CHECKING, Dict, Optional, Union +from typing import Dict, Optional, Union import torch @@ -12,11 +12,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, UnquantizedEmbeddingMethod) -if TYPE_CHECKING: - from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod - from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinLinearMethod, GPTQMarlinMoEMethod) - # Match dynamic rules with module name (prefix) and override quantize # config if module (prefix) matches a rule @@ -80,9 +75,7 @@ def get_linear_quant_method( layer: torch.nn.Module, prefix: str, linear_method_cls: type, -) -> Optional[Union["GPTQLinearMethod", "GPTQMarlinLinearMethod", - "GPTQMarlinMoEMethod", UnquantizedLinearMethod, - UnquantizedEmbeddingMethod]]: +): cloned_config = deepcopy(config) parallel_lm_head_quantized = isinstance( layer, ParallelLMHead) and cloned_config.lm_head_quantized From c7f10be8548a3e31f7d472b8f151164d4a2ed3a9 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Wed, 12 Feb 2025 03:04:38 +0000 Subject: [PATCH 56/56] do not use Fraction Signed-off-by: ZX-ModelCloud --- vllm/model_executor/layers/quantization/utils/gptq_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index fe2a7c959f6c..5b0e6299f473 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import re from copy import deepcopy -from fractions import Fraction from typing import Dict, Optional, Union import torch @@ -29,12 +28,12 @@ 
def override_config(config: QuantizationConfig, prefix: str): if isinstance(desc_act, bool): config.desc_act = desc_act + config.pack_factor = 32 // config.weight_bits # packed into int32 if config.get_name() == "gptq_marlin": is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym) if isinstance(is_sym, bool): config.is_sym = is_sym - config.pack_factor = 32 // config.weight_bits # packed into int32 if (config.weight_bits, config.is_sym) not in config.TYPE_MAP: raise ValueError("Unsupported quantization config: " f"bits={config.weight_bits}, sym={config.is_sym}") @@ -42,7 +41,6 @@ def override_config(config: QuantizationConfig, prefix: str): config.quant_type = config.TYPE_MAP[(config.weight_bits, config.is_sym)] elif config.get_name() == "gptq": - config.pack_factor = Fraction(32, config.weight_bits) if config.weight_bits not in [2, 3, 4, 8]: raise ValueError( "Currently, only 2/3/4/8-bit weight quantization is "
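For reference, the pack-factor arithmetic that `override_config` now recomputes with plain integer division: GPTQ packs quantized weights into int32 words, so the factor is simply how many values of the overridden bit-width fit into 32 bits. A quick worked check:

for bits in (2, 4, 8):
    print(f"{bits}-bit -> {32 // bits} weights per packed int32")
# 2-bit -> 16 weights per packed int32
# 4-bit -> 8 weights per packed int32
# 8-bit -> 4 weights per packed int32

Note that 3-bit is the one GPTQ width where this division is not exact (32/3), which is presumably why the unmodified GPTQ config path keeps the `fractions` import seen earlier in gptq.py.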