@@ -1970,6 +1970,7 @@ def choose_qparams_affine_float8(
     tensor: torch.Tensor,
     float8_dtype: torch.dtype = torch.float8_e4m3fn,
     scale_dtype: torch.dtype = torch.float32,
+    block_size: Optional[Tuple[int, ...]] = None,
 ) -> torch.Tensor:
     """
     Calculates float8 scaling factor for the given high precision tensor, using tensorwise granularity.
@@ -1978,12 +1979,27 @@ def choose_qparams_affine_float8(
         tensor (torch.Tensor): Input tensor to be quantized.
         float8_dtype (torch.dtype): Data type of the quantized tensor (e.g., torch.float8_e4m3fn, torch.float8_e5m2).
     """
+    quant_max = torch.finfo(float8_dtype).max
     # only tensorwise scaling is supported for now:
-    quant_min, quant_max = torch.finfo(float8_dtype).min, torch.finfo(float8_dtype).max
-    min_val_neg = torch.min(tensor)
-    max_val_pos = torch.max(tensor)
-    max_val_pos = torch.max(-min_val_neg, max_val_pos)
-    scale = max_val_pos / (float(quant_max - quant_min) / 2)
+    if block_size is None:
+        max_abs = tensor.abs().max()
+        scale = max_abs / quant_max
+    else:
+        shape_for_reduction, reduction_dims = _get_reduction_params(
+            block_size, tensor.shape
+        )
+        tensor_reshaped = tensor.view(shape_for_reduction)
+        max_abs = tensor_reshaped.abs().amax(dim=reduction_dims, keepdim=True)
+
+        scale = max_abs / quant_max
+        # Reshape scale back to match the expected output shape
+        # The scale tensor should have the same shape as the input divided by block_size
+        output_shape = [
+            input_size // block_size[i] if block_size[i] > 1 else input_size
+            for i, input_size in enumerate(tensor.shape)
+        ]
+        scale = scale.reshape(output_shape)
+
     return scale.to(dtype=scale_dtype)


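A minimal sketch of the scale shapes the two paths above produce, with the `_get_reduction_params` reshape mirrored in plain PyTorch. The `(4, 8)` input and `(1, 4)` block size are illustrative, and the assumption is that `_get_reduction_params` splits each blocked dimension into `(num_blocks, block_size)`:

```python
import torch

tensor = torch.randn(4, 8, dtype=torch.float32)
quant_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

# Tensorwise path (block_size=None): a single scalar scale.
scale_tensorwise = tensor.abs().max() / quant_max  # shape: torch.Size([])

# Block-wise path with block_size=(1, 4): for a (4, 8) input,
# _get_reduction_params is assumed to yield shape (4, 2, 4) with dim 2
# as the reduction dim, so each (1, 4) block gets its own max.
max_abs = tensor.view(4, 2, 4).abs().amax(dim=2, keepdim=True)  # (4, 2, 1)
scale_blockwise = (max_abs / quant_max).reshape(4, 2)  # output_shape: (4, 2)
```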
@@ -2027,5 +2043,24 @@ def dequantize_affine_float8(
     # upcasted to `float32` to divide by the scale, since scale is a fp32 for float8 quantization.
     # In order to match numerics between eager and compile, we upcast manually here.
     fp8_tensor = tensor.to(torch.float32)
-    hp_tensor = fp8_tensor * scale
+    # For block-wise quantization, we need to broadcast the scale to match tensor dimensions
+    if scale.shape != tensor.shape:
+        # Calculate the block size from the shape difference
+        block_size = tuple(
+            tensor.shape[i] // scale.shape[i]
+            if scale.shape[i] != tensor.shape[i]
+            else 1
+            for i in range(len(tensor.shape))
+        )
+
+        scale_expanded = scale
+        for i in range(len(tensor.shape)):
+            if block_size[i] > 1:
+                # Repeat the scale values for each block along this dimension
+                scale_expanded = scale_expanded.repeat_interleave(block_size[i], dim=i)
+    else:
+        # Tensor-wise quantization: the scale already matches, so no expansion is needed
+        scale_expanded = scale
+
+    hp_tensor = fp8_tensor * scale_expanded
     return hp_tensor.to(output_dtype)
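A round-trip sketch of the new broadcast (again illustrative, reusing the `(1, 4)` block layout from above): the `(4, 2)` per-block scales are expanded back to `(4, 8)` with the same `repeat_interleave` the patch uses, so each scale value covers its four-element block:

```python
import torch

quant_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
tensor = torch.randn(4, 8, dtype=torch.float32) * 10

# Per-block scales for block_size=(1, 4), shape (4, 2), as computed above.
scale = tensor.view(4, 2, 4).abs().amax(dim=2) / quant_max

# Quantize: divide by the expanded scale, clamp to the fp8 range, and cast.
scale_expanded = scale.repeat_interleave(4, dim=1)  # (4, 2) -> (4, 8)
fp8 = (tensor / scale_expanded).clamp(-quant_max, quant_max).to(torch.float8_e4m3fn)

# Dequantize with the same block-wise broadcast dequantize_affine_float8 performs.
hp = fp8.to(torch.float32) * scale_expanded
rel_err = (hp - tensor).abs().max() / tensor.abs().max()
print(f"max relative error: {rel_err:.4f}")  # a few percent, as expected for e4m3
```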