From 4ffde1b6d7ba2ae0b3273c461bfaf86da9a3fa10 Mon Sep 17 00:00:00 2001
From: Elfie Guo
Date: Fri, 20 Sep 2024 04:42:33 +0000
Subject: [PATCH 1/3] Default to use per_token quantization for fp8 when
 cutlass is supported.

---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index b5feb55db0e7..c70838134739 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -276,7 +276,8 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            use_per_token_if_dynamic=False)
+            # Default to using per_token quantization if cutalss fp8 is supported.
+            use_per_token_if_dynamic=self.cutlass_fp8_supported)


 class Fp8MoEMethod(FusedMoEMethodBase):

From 987c825d96e5c3251df22865f6368de16b51987b Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Wed, 15 Jan 2025 10:51:58 -0500
Subject: [PATCH 2/3] Update vllm/model_executor/layers/quantization/fp8.py

---
 vllm/model_executor/layers/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index c70838134739..b6aebc9adbbd 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -276,7 +276,7 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            # Default to using per_token quantization if cutalss fp8 is supported.
+            # Default to using per_token quantization if cutlass fp8 is supported.
             use_per_token_if_dynamic=self.cutlass_fp8_supported)

From c0f2283487d5951f893699c973c216db9c5c5a22 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 15 Jan 2025 15:54:05 +0000
Subject: [PATCH 3/3] Format

Signed-off-by: mgoin

---
 vllm/model_executor/layers/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index ae98a6f085b1..4969ee559522 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,7 @@ def apply(self,
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            # Default to using per_token quantization if cutlass fp8 is supported.
+            # Default to using per_token quantization if cutlass is supported
             use_per_token_if_dynamic=self.cutlass_fp8_supported)
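
Note on the change: when no static input_scale is available, scales are computed dynamically at runtime, and use_per_token_if_dynamic selects between one fp8 scale for the entire activation tensor and one scale per token (row). This patch series flips the default to per-token whenever the CUTLASS fp8 path is available, which preserves more precision when token magnitudes vary across a batch. The sketch below is a minimal illustration of the two strategies only, not vLLM's actual apply_fp8_linear kernel path; the helper name dynamic_fp8_quantize and the choice of the e4m3 fp8 format are assumptions made for the example.

import torch

# Max representable magnitude in fp8 e4m3 (448.0); assumed format for this sketch.
FP8_MAX = torch.finfo(torch.float8_e4m3fn).max

def dynamic_fp8_quantize(x: torch.Tensor, per_token: bool):
    """Illustrative dynamic fp8 quantization (hypothetical helper, not vLLM's).

    per_token=False: one scale shared by the whole tensor (per-tensor).
    per_token=True:  one scale per row/token, the behavior this patch
                     defaults to when CUTLASS fp8 kernels are available.
    """
    if per_token:
        # Shape [num_tokens, 1]: each token's scale tracks its own max magnitude.
        amax = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    else:
        # 0-dim tensor: a single scale for every element.
        amax = x.abs().amax().clamp(min=1e-12)
    scale = amax.float() / FP8_MAX
    x_fp8 = (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_fp8, scale  # dequantize later as x_fp8.float() * scale

# Example: per-token scales carry one entry per row.
x = torch.randn(4, 8, dtype=torch.float16)
_, s_tensor = dynamic_fp8_quantize(x, per_token=False)  # s_tensor: 0-dim scalar
_, s_token = dynamic_fp8_quantize(x, per_token=True)    # s_token: shape [4, 1]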