Commit 0160ff0

Add runtime swap AB for SM100 blockwise GEMM
Signed-off-by: Barry Kang <[email protected]>
1 parent 50e5e72 commit 0160ff0

3 files changed, 144 insertions(+), 40 deletions(-)
tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 29 additions & 21 deletions

@@ -45,8 +45,8 @@
 from tensorrt_llm.llmapi.utils import enable_llm_debug
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantConfig
-from tensorrt_llm.quantization.utils.fp8_utils import (
-    resmooth_to_fp8_e8m0, transform_sf_into_required_layout)
+from tensorrt_llm.quantization.utils.fp8_utils import \
+    transform_sf_into_required_layout
 
 from ..attention_backend import AttentionMetadata
 from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams

@@ -1384,14 +1384,29 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
          None) is not None and not dequant_kv_b_proj:
     kv_b_proj_scale, k_b_proj_trans_scale = load_kv_b_proj_and_k_b_proj_trans(
         name, is_scale=True)
-    module.weight_scale.copy_(
-        kv_b_proj_scale.reshape(module.weight_scale.shape))
     attn_module.k_b_proj_trans_scale.copy_(
         k_b_proj_trans_scale.reshape(
             attn_module.k_b_proj_trans_scale.shape))
 
-    _, v_b_proj_scale = split_kv_b_proj(
-        module.weight_scale.data, is_scale=True)
+    if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
+    ) and get_sm_version() == 100:
+        _, v_b_proj_scale = split_kv_b_proj(kv_b_proj_scale,
+                                            is_scale=True)
+        kv_b_proj_scale = transform_sf_into_required_layout(
+            kv_b_proj_scale,
+            mn=kv_b_proj.shape[0],
+            k=kv_b_proj.shape[1],
+            recipe=(1, 128, 128),
+            is_sfa=False)
+        module.weight_scale.copy_(
+            kv_b_proj_scale.reshape(
+                module.weight_scale.shape))
+    else:
+        module.weight_scale.copy_(
+            kv_b_proj_scale.reshape(
+                module.weight_scale.shape))
+        _, v_b_proj_scale = split_kv_b_proj(
+            module.weight_scale.data, is_scale=True)
     attn_module.v_b_proj_scale = nn.Parameter(
         v_b_proj_scale, requires_grad=False)
 

@@ -1432,6 +1447,14 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
     fused_a_scale = torch.cat(
         [q_a_proj_scale, fused_a_scale], dim=0)
 
+    if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
+    ) and get_sm_version() == 100:
+        fused_a_scale = transform_sf_into_required_layout(
+            fused_a_scale,
+            mn=fused_a.shape[0],
+            k=fused_a.shape[1],
+            recipe=(1, 128, 128),
+            is_sfa=False)
     module.weight_scale.data.copy_(fused_a_scale)
 
     module.weight.data.copy_(fused_a)
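
Note (illustrative aside, not part of the commit): the transform_sf_into_required_layout call above is driven purely by the fused weight's (out, in) shape. A rough shape sketch, using DeepSeek-V3-like sizes as an assumption:

import math

# Sketch only: fused_a stacks q_a_proj and kv_a_proj_with_mqa rows, roughly
# (q_lora_rank + kv_lora_rank + qk_rope_head_dim, hidden_size) for DeepSeek-V3.
mn, k = 1536 + 512 + 64, 7168                                 # assumed sizes
raw_scale_shape = (math.ceil(mn / 128), math.ceil(k / 128))   # float32 blockwise: (17, 56)
packed_shape = (math.ceil(k / 512), mn)                       # SM100 int32 buffer: (14, 2112)
# transform_sf_into_required_layout(fused_a_scale, mn=mn, k=k,
#                                   recipe=(1, 128, 128), is_sfa=False)
# takes the former layout to the latter before the weight_scale copy above.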

@@ -1462,21 +1485,6 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
 for n, p in module.named_parameters():
     p.data.copy_(module_weights[n][:])
 
-if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
-) and get_sm_version() == 100 and hasattr(
-        module, "weight_scale"):
-    weight, weight_scale = resmooth_to_fp8_e8m0(
-        module.weight, module.weight_scale)
-    transfromed_scale = transform_sf_into_required_layout(
-        weight_scale,
-        mn=weight.shape[0],
-        k=weight.shape[1],
-        recipe=(1, 128, 128),
-        is_sfa=False)
-    module.weight = nn.Parameter(weight, requires_grad=False)
-    module.weight_scale = nn.Parameter(transfromed_scale,
-                                       requires_grad=False)
-
 for idx, layer in enumerate(
         self.model.layers[:self.config.num_hidden_layers]):
     if idx == self.config.num_hidden_layers - 1:

tensorrt_llm/_torch/modules/linear.py

Lines changed: 51 additions & 13 deletions

@@ -558,11 +558,19 @@ def create_weights(self, module: Linear, in_features: int,
         module.weight = Parameter(torch.empty(weight_shape,
                                               dtype=torch.float8_e4m3fn),
                                   requires_grad=False)
-        scale_shape = (math.ceil(out_features / 128),
-                       math.ceil(in_features / 128))
-        module.weight_scale = Parameter(torch.empty(scale_shape,
-                                                    dtype=torch.float32),
-                                        requires_grad=False)
+
+        if get_sm_version() == 100:
+            scale_shape = (math.ceil(in_features / 512),
+                           math.ceil(out_features))
+            module.weight_scale = Parameter(torch.empty(scale_shape,
+                                                        dtype=torch.int32).T,
+                                            requires_grad=False)
+        else:
+            scale_shape = (math.ceil(out_features / 128),
+                           math.ceil(in_features / 128))
+            module.weight_scale = Parameter(torch.empty(scale_shape,
+                                                        dtype=torch.float32),
+                                            requires_grad=False)
         # Not really used for Gemm now.
         # Only used to quantize output of FP8 attention.
         module.input_scale = Parameter(torch.tensor(1., dtype=torch.float32),
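
Note (illustrative aside, not part of the commit): the two allocation branches above differ only in the scale buffer's shape and dtype. A minimal sketch of that shape math, with made-up layer sizes; reading the /512 divisor as "four 128-wide group scales packed per int32" is an inference, not documented API:

import math

def weight_scale_shape(out_features: int, in_features: int, sm_version: int):
    """Mirror of the allocation branches in create_weights above (sketch only)."""
    if sm_version == 100:
        # Packed layout: int32 buffer of shape (ceil(in/512), out), exposed via .T.
        return (math.ceil(in_features / 512), out_features), "int32, transposed"
    # Default blockwise layout: one float32 scale per 128x128 weight block.
    return (math.ceil(out_features / 128), math.ceil(in_features / 128)), "float32"

# Made-up example: a 2048x7168 projection.
print(weight_scale_shape(2048, 7168, 100))   # ((14, 2048), 'int32, transposed')
print(weight_scale_shape(2048, 7168, 90))    # ((16, 56), 'float32')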

@@ -592,14 +600,30 @@ def apply(self, module: Linear, input: torch.Tensor,
                     module.weight_scale)
             else:
                 from tensorrt_llm import deep_gemm
-                a, a_sf = fp8_utils.per_token_quant_and_transform(input)
-                output = torch.empty((input.shape[0], module.weight.shape[0]),
-                                     device=input.device,
-                                     dtype=torch.bfloat16)
-                deep_gemm.fp8_gemm_nt((a, a_sf),
-                                      (module.weight, module.weight_scale),
-                                      output,
-                                      disable_ue8m0_cast=True)
+                if input.shape[0] < 32:
+                    # Swap AB
+                    a, a_sf = fp8_utils.per_token_quant_and_transform(
+                        input, swap_ab=True)
+                    output_padded = torch.empty(
+                        (module.weight.shape[0], a.shape[0]),
+                        device=input.device,
+                        dtype=torch.bfloat16)
+                    deep_gemm.fp8_gemm_nt((module.weight, module.weight_scale),
+                                          (a, a_sf),
+                                          output_padded,
+                                          disable_ue8m0_cast=True)
+                    output = fp8_utils.masked_transpose(output_padded,
+                                                        input.shape[0])
+                else:
+                    a, a_sf = fp8_utils.per_token_quant_and_transform(input)
+                    output = torch.empty(
+                        (input.shape[0], module.weight.shape[0]),
+                        device=input.device,
+                        dtype=torch.bfloat16)
+                    deep_gemm.fp8_gemm_nt((a, a_sf),
+                                          (module.weight, module.weight_scale),
+                                          output,
+                                          disable_ue8m0_cast=True)
         else:
             act_input_fp8, act_input_sf = torch.ops.trtllm.fp8_quantize_1x128(
                 input)
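
Note (illustrative aside, not part of the commit): the new small-batch branch is just the transpose identity x @ W.T == (W @ x.T).T, with the token dimension padded to a multiple of 8 and trimmed back afterwards. A minimal numeric sketch in float64 (the real path runs FP8 inputs with a bf16 output through deep_gemm; the < 32 threshold and the pad-to-8 rule are taken from this diff):

import torch

m, k, n = 5, 256, 512                            # a handful of decode-time tokens
x = torch.randn(m, k, dtype=torch.float64)       # activations
w = torch.randn(n, k, dtype=torch.float64)       # weight (out_features, in_features)

direct = x @ w.T                                 # normal orientation: (m, n)

m_padded = (m + 7) // 8 * 8                      # pad tokens to a multiple of 8,
x_padded = torch.zeros(m_padded, k, dtype=torch.float64)   # as swap_ab=True does
x_padded[:m] = x
swapped = (w @ x_padded.T)[:, :m].T              # swap-AB GEMM, trim padding, transpose
                                                 # back (masked_transpose in the real code)
torch.testing.assert_close(direct, swapped)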

@@ -625,6 +649,13 @@ def load_weights_vanilla(self, module: Linear, weights: List[Dict]) -> None:
         weight_scale = load_weight_shard(weights[0][scale_name], module.tp_size,
                                          module.tp_rank,
                                          module.tp_mode).squeeze()
+        if get_sm_version() == 100:
+            weight_scale = fp8_utils.transform_sf_into_required_layout(
+                weight_scale,
+                mn=module.weight.shape[0],
+                k=module.weight.shape[1],
+                recipe=(1, 128, 128),
+                is_sfa=False)
         copy_weight(module.weight_scale, weight_scale)
         if "input_scale" in weights[0]:
             copy_weight(module.input_scale, weights[0]["input_scale"])

@@ -661,6 +692,13 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
                                        module.tp_rank, module.tp_mode)
         fused_scale = torch.cat([left_scale, right_scale], dim=0).squeeze()
         copy_weight(module.weight, fused_weight)
+        if get_sm_version() == 100:
+            fused_scale = fp8_utils.transform_sf_into_required_layout(
+                fused_scale,
+                mn=fused_weight.shape[0],
+                k=fused_weight.shape[1],
+                recipe=(1, 128, 128),
+                is_sfa=False)
         copy_weight(module.weight_scale, fused_scale)
 
 
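
Note (illustrative aside, not part of the commit): gate and up projections share in_features, so their blockwise scales concatenate along dim 0 exactly like the weights do, and the SM100 repack then sees the fused (out, in) shape. Sizes below are made up:

import math
import torch

in_features, out_gate, out_up = 7168, 2048, 2048     # assumed sizes
left_scale = torch.rand(math.ceil(out_gate / 128), math.ceil(in_features / 128))
right_scale = torch.rand(math.ceil(out_up / 128), math.ceil(in_features / 128))

fused_scale = torch.cat([left_scale, right_scale], dim=0)   # (32, 56)
fused_out = out_gate + out_up                               # 4096
assert fused_scale.shape == (math.ceil(fused_out / 128),
                             math.ceil(in_features / 128))
# On SM100 the fused float32 scale is then repacked with mn=fused_out,
# k=in_features so it matches the packed int32 buffer from create_weights.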

tensorrt_llm/quantization/utils/fp8_utils.py

Lines changed: 64 additions & 6 deletions

@@ -448,6 +448,7 @@ def per_token_quant_and_transform(
     input: torch.Tensor,
     quant_group_size: int = 128,
     scale_ue8m0: bool = True,
+    swap_ab=False,
 ):
     """
     input shape [g, m, k]

@@ -467,18 +468,21 @@
     fp8_min = -fp8_max
 
     m, k = input.shape
+    m_padded = m if not swap_ab else align(m, 8)
 
     # Create output
-    output = torch.empty((m, k), dtype=torch.float8_e4m3fn, device="cuda")
+    output = torch.empty((m_padded, k),
+                         dtype=torch.float8_e4m3fn,
+                         device="cuda")
 
     # Create output scale
     alignment = 4
     scale_k = ceil_div(k, quant_group_size)
-    m_padded = align(m, alignment)
+    m_aligned = align(m_padded, alignment)
     scale_k_padded = align(scale_k, alignment)
-    output_scale = torch.zeros((scale_k_padded // 4, m_padded),
+    output_scale = torch.empty((scale_k_padded // 4, m_aligned),
                                dtype=torch.int32,
-                               device='cuda')
+                               device="cuda")
 
     # Get block/grid/stage/warp
     BLOCK_NUM_PER_EXPERT = 64
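
Note (illustrative aside, not part of the commit): a sketch of the shape bookkeeping in per_token_quant_and_transform after this change, assuming align(x, a) rounds x up to a multiple of a:

from math import ceil

def quant_output_shapes(m: int, k: int, quant_group_size: int = 128,
                        swap_ab: bool = False):
    """Shape bookkeeping of per_token_quant_and_transform above (sketch only)."""
    align = lambda x, a: ceil(x / a) * a              # assumed behaviour of align()
    m_padded = m if not swap_ab else align(m, 8)      # extra row padding for swap-AB
    scale_k = ceil(k / quant_group_size)
    m_aligned = align(m_padded, 4)                    # alignment = 4, as in the code
    scale_k_padded = align(scale_k, 4)
    fp8_out = (m_padded, k)                           # float8_e4m3fn data
    packed_scale = (scale_k_padded // 4, m_aligned)   # int32, transposed before return
    returned_scale = (m_padded, scale_k_padded // 4)  # after transpose(0, 1)[:m_padded, :]
    return fp8_out, packed_scale, returned_scale

print(quant_output_shapes(5, 7168))                # ((5, 7168), (14, 8), (5, 14))
print(quant_output_shapes(5, 7168, swap_ab=True))  # ((8, 7168), (14, 8), (8, 14))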

@@ -508,13 +512,67 @@
         num_warps=num_warps,
         SCALE_UE8M0=scale_ue8m0,
     )
-    output_scale = output_scale.transpose(0, 1)[:m, :]
+    output_scale = output_scale.transpose(0, 1)[:m_padded, :]
     check_sf_layout(
         output_scale,
-        m,
+        m_padded,
         k,
         (1, 128),
         num_groups=None,
         tma_stride_check=True,
     )
     return output, output_scale
+
+
+@triton.jit
+def _transpose_kernel(input_ptr, output_ptr, M, N, stride_in_m, stride_in_n,
+                      stride_out_m, stride_out_n, BLOCK_SIZE: tl.constexpr):
+    row_block = tl.program_id(0)
+    col_block = tl.program_id(1)
+
+    row = row_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    col = col_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    mask_row = row < M
+    mask_col = col < N
+    mask = mask_row[:, None] & mask_col[None, :]
+
+    input_idx = row[:, None] * stride_in_m + col[None, :] * stride_in_n
+    data = tl.load(input_ptr + input_idx, mask=mask, other=0)
+
+    output_idx = row[:, None] * stride_out_n + col[None, :] * stride_out_m
+    tl.store(output_ptr + output_idx, data, mask=mask)
+
+
+def masked_transpose(input: torch.Tensor, n_available: int) -> torch.Tensor:
+    """
+    Perform a masked transpose operation on a 2D tensor.
+
+    Args:
+        input: Input tensor of shape (M, N)
+        n_available: Number of columns to transpose (must be <= N)
+
+    Returns:
+        Transposed tensor of shape (n_available, M)
+    """
+    M, N = input.shape
+    assert n_available <= N, "n_available must be less than or equal to N"
+    BLOCK_SIZE = 32
+    output = torch.empty((n_available, M),
+                         dtype=input.dtype,
+                         device=input.device)
+
+    grid = ((M + BLOCK_SIZE - 1) // BLOCK_SIZE,
+            (n_available + BLOCK_SIZE - 1) // BLOCK_SIZE)
+    _transpose_kernel[grid](
+        input,
+        output,
+        M,
+        n_available,
+        input.stride(0),
+        input.stride(1),
+        output.stride(0),
+        output.stride(1),
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return output
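
Note (illustrative aside, not part of the commit): masked_transpose(x, n) as added above should match x[:, :n].t() while skipping the padded columns of the swap-AB output. A quick reference check using the function defined in this file (needs a CUDA device with Triton):

import torch

# Shapes mimic the swap-AB GEMM output: (out_features, padded_tokens),
# trimmed back to the real token count.
x = torch.randn(512, 8, dtype=torch.bfloat16, device="cuda")
n_real = 5

got = masked_transpose(x, n_real)            # (5, 512)
ref = x[:, :n_real].t().contiguous()
assert got.shape == (n_real, x.shape[0])
torch.testing.assert_close(got, ref)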
