diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index bc697ef93b34..21e6fe7a2261 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -42,7 +42,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
 
         if not sparse_cutlass_supported():
             raise ValueError(
-                "Sparse CUTLASS not supported. vLLM must be built with"
+                "Sparse CUTLASS not supported. vLLM must be built with "
                 "CUDA 12.2 or later to use this feature")
 
         self.output_dtype = params_dtype
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 91225c0ddc91..5b97eced62df 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -390,8 +390,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 6517422697c0..989056bf5c15 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -440,8 +440,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index ff1f1c2a939f..b2aa3c0709bd 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -452,8 +452,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
         for name, loaded_weight in weights:
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index ac679d6ff43c..eab3bf0756fc 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -533,8 +533,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 56343ca9a71a..08298cc0db36 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -316,8 +316,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 67e04b57658b..ddd2d7a16b24 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -475,8 +475,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4667f275ecd3..a5bd418801f2 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -105,9 +105,9 @@ def __init__(self,
                  max_position_embeddings: int = 8192,
                  quant_config: Optional[QuantizationConfig] = None,
                  bias: bool = False,
+                 bias_o_proj: bool = False,
                  cache_config: Optional[CacheConfig] = None,
-                 prefix: str = "",
-                 bias_o_proj: bool = False) -> None:
+                 prefix: str = "") -> None:
         super().__init__()
         layer_idx = extract_layer_index(prefix)
         self.hidden_size = hidden_size
@@ -397,8 +397,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 2c8895e84299..da415cdae96e 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -431,8 +431,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index bd261f31499c..2554281610a3 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1432,8 +1432,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index e7875e6fb889..2340283b6966 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index dc76818e22cb..881c09ea9db9 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -626,8 +626,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
 
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index b9c259ad73c4..d015f60c6d06 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -367,8 +367,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index b27d2b10850f..37c5a4b5713b 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -492,8 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str,
                 continue
             if (self.quant_config is not None and
                 (scale_name := self.quant_config.get_cache_scale(name))):
-                # Loading kv cache scales for quark and
-                # compressed-tensors quantization
+                # Loading kv cache quantization scales
                 param = params_dict[scale_name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
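Two patterns in this patch are worth spelling out: adjacent string literals concatenate with no separator, which is why the trailing space added to the `ValueError` message is load-bearing, and the `scale_name` walrus expression both tests for and captures the remapped kv-cache scale name in a single condition. Below is a minimal, self-contained sketch of both; `DummyQuantConfig` and its `.k_scale` to `.attn.k_scale` mapping are illustrative assumptions, not vLLM's actual `get_cache_scale` logic.

```python
# Sketch only: DummyQuantConfig stands in for a vLLM quantization config;
# the ".k_scale" -> ".attn.k_scale" mapping is a made-up example.
class DummyQuantConfig:

    def get_cache_scale(self, name: str):
        """Map a checkpoint weight name to the parameter that stores its
        kv-cache scale, or return None if `name` is not a cache scale."""
        if name.endswith(".k_scale"):
            return name.replace(".k_scale", ".attn.k_scale")
        return None


# 1. Adjacent string literals concatenate with no separator, so the
#    trailing space added in compressed_tensors_24.py is load-bearing.
broken = ("Sparse CUTLASS not supported. vLLM must be built with"
          "CUDA 12.2 or later to use this feature")
fixed = ("Sparse CUTLASS not supported. vLLM must be built with "
         "CUDA 12.2 or later to use this feature")
assert "withCUDA" in broken   # the bug the first hunk fixes
assert "with CUDA" in fixed   # the corrected message

# 2. The rewritten comment documents this load_weights pattern: the walrus
#    operator checks that `name` maps to a kv-cache scale and captures the
#    remapped parameter name in one expression.
quant_config = DummyQuantConfig()
params_dict = {"layers.0.attn.k_scale": None}

name, loaded_weight = "layers.0.k_scale", 0.125
if (quant_config is not None and
        (scale_name := quant_config.get_cache_scale(name))):
    # Loading kv cache quantization scales
    params_dict[scale_name] = loaded_weight

assert params_dict["layers.0.attn.k_scale"] == 0.125
```

The walrus form keeps the remapping to a single dictionary lookup per weight: `get_cache_scale` returning a falsy value (here `None`) skips the branch entirely, so non-scale weights fall through to the normal loading path unchanged.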