From 4ffde1b6d7ba2ae0b3273c461bfaf86da9a3fa10 Mon Sep 17 00:00:00 2001
From: Elfie Guo
Date: Fri, 20 Sep 2024 04:42:33 +0000
Subject: [PATCH 1/3] Default to use per_token quantization for fp8 when
 cutlass is supported.

---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index b5feb55db0e7..c70838134739 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -276,7 +276,8 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            use_per_token_if_dynamic=False)
+            # Default to using per_token quantization if cutalss fp8 is supported.
+            use_per_token_if_dynamic=self.cutlass_fp8_supported)


 class Fp8MoEMethod(FusedMoEMethodBase):

From 987c825d96e5c3251df22865f6368de16b51987b Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Wed, 15 Jan 2025 10:51:58 -0500
Subject: [PATCH 2/3] Update vllm/model_executor/layers/quantization/fp8.py

---
 vllm/model_executor/layers/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index c70838134739..b6aebc9adbbd 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -276,7 +276,7 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            # Default to using per_token quantization if cutalss fp8 is supported.
+            # Default to using per_token quantization if cutlass fp8 is supported.
             use_per_token_if_dynamic=self.cutlass_fp8_supported)

From c0f2283487d5951f893699c973c216db9c5c5a22 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 15 Jan 2025 15:54:05 +0000
Subject: [PATCH 3/3] Format

Signed-off-by: mgoin

---
 vllm/model_executor/layers/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index ae98a6f085b1..4969ee559522 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,7 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            # Default to using per_token quantization if cutlass fp8 is supported.
+            # Default to using per_token quantization if cutlass is supported
             use_per_token_if_dynamic=self.cutlass_fp8_supported)
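
Note on the change: when no static input_scale is available, scales are computed dynamically at runtime, and use_per_token_if_dynamic selects between one fp8 scale for the entire activation tensor and one scale per token (row). This patch series flips the default to per-token whenever the CUTLASS fp8 path is available, which preserves more precision when token magnitudes vary across a batch. The sketch below is a minimal illustration of the two strategies only, not vLLM's actual apply_fp8_linear kernel path; the helper name dynamic_fp8_quantize and the choice of the e4m3 fp8 format are assumptions made for the example.

import torch

# Max representable magnitude in fp8 e4m3 (448.0); assumed format for this sketch.
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max

def dynamic_fp8_quantize(x: torch.Tensor, per_token: bool):
    """Illustrative dynamic fp8 quantization (hypothetical helper, not vLLM's).

    per_token=False: one scale shared by the whole tensor (per-tensor).
    per_token=True:  one scale per row/token, the behavior this patch
                     defaults to when CUTLASS fp8 kernels are available.
    """
    if per_token:
        # Shape [num_tokens, 1]: each token's scale tracks its own max magnitude.
        amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    else:
        # 0-dim tensor: a single scale for every element.
        amax = x.abs().amax().clamp(min=1e-12)
    scale = amax.float() / FP8_MAX
    x_fp8 = (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_fp8, scale  # dequantize later as x_fp8.float() * scale

# Example: per-token scales carry one entry per row.
x = torch.randn(4, 8, dtype=torch.float16)
_, s_tensor = dynamic_fp8_quantize(x, per_token=False)  # s_tensor: 0-dim scalar
_, s_token = dynamic_fp8_quantize(x, per_token=True)    # s_token: shape [4, 1]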