Commit 75ae9d6
[mxfp8 moe training] add per group blocked scale kernels
stack-info: PR: #2886, branch: danielvegamyhre/stack/62
1 parent 16f5bef commit 75ae9d6

4 files changed: +298 −1 lines changed

test/prototype/moe_training/test_kernels.py

Lines changed: 39 additions & 0 deletions

@@ -7,6 +7,8 @@
 import pytest
 import torch
 
+from torchao.prototype.mx_formats.utils import to_blocked_per_group_2d
+
 # We need to skip before doing any imports which would use triton, since
 # triton won't be available on CPU builds
 if not (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9):
@@ -20,12 +22,17 @@
     triton_fp8_per_group_colwise_scales,
     triton_fp8_per_group_rowwise_scales,
 )
+from torchao.prototype.moe_training.kernels.mxfp8_blocked_scales import (
+    triton_mx_block_rearrange_per_group,
+)
 from torchao.prototype.moe_training.utils import (
     _is_column_major,
+    generate_jagged_offs,
     torch_to_3d_rowwise_float8_transpose_rhs,
     torch_to_float8_per_group_colwise,
     torch_to_float8_per_group_rowwise,
 )
+from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.testing.utils import skip_if_rocm
 
 
@@ -118,3 +125,35 @@ def test_fp8_rowwise_3d_transpose_rhs(round_scales_to_power_of_2: bool):
     assert ref_fp8.shape == triton_fp8.shape, "output shapes not equal"
     assert ref_fp8.stride() == triton_fp8.stride(), "output strides not equal"
     assert torch.allclose(ref_fp8, triton_fp8, rtol=0, atol=0), "fp8 data not equal"
+
+
+@skip_if_rocm("ROCm enablement in progress")
+@pytest.mark.parametrize("m,k,n_groups", [(256, 256, 4), (16640, 5120, 16)])
+def test_mxfp8_per_group_blocked_scales_2d(
+    m: int,
+    k: int,
+    n_groups: int,
+):
+    device = "cuda"
+    block_size = 32
+    input_data = torch.randn(m, k, device=device)
+    e8m0_scales, _ = to_mx(
+        input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+    )
+    offs = generate_jagged_offs(n_groups, m, multiple_of=block_size, device=device)
+
+    # torch reference
+    ref_out_scales, ref_group_offsets = to_blocked_per_group_2d(
+        e8m0_scales, offs, m, k, block_size=block_size
+    )
+
+    # triton kernel
+    triton_out_scales, triton_group_offsets = triton_mx_block_rearrange_per_group(
+        e8m0_scales, offs
+    )
+    assert torch.allclose(ref_group_offsets, triton_group_offsets, atol=0, rtol=0), (
+        "group offsets not equal"
+    )
+    assert torch.allclose(ref_out_scales, triton_out_scales, atol=0, rtol=0), (
+        "blocked scales not equal"
+    )
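
For intuition about the shapes the new test exercises: to_mx with block_size=32 emits one e8m0 scale per 32 contiguous values along K, so an (m, k) input yields (m, k // 32) scales. A minimal sketch along those lines (illustrative values, not from the diff):

    import torch
    from torchao.prototype.mx_formats.mx_tensor import to_mx

    x = torch.randn(256, 256, device="cuda")
    e8m0_scales, _ = to_mx(x, elem_dtype=torch.float8_e4m3fn, block_size=32)
    print(e8m0_scales.shape)  # expected: torch.Size([256, 8]), one scale per 32-wide block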

torchao/prototype/moe_training/kernels/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,6 +7,6 @@
 from torchao.prototype.moe_training.kernels.jagged_float8_scales import (
     triton_fp8_per_group_rowwise_scales as triton_fp8_per_group_rowwise_scales,
 )
-from torchao.prototype.moe_training.kernels.mxfp8 import (
+from torchao.prototype.moe_training.kernels.mxfp8_gemms import (
     fbgemm_mxfp8_grouped_mm_2d_3d as fbgemm_mxfp8_grouped_mm_2d_3d,
 )

torchao/prototype/moe_training/kernels/mxfp8_blocked_scales.py

Lines changed: 253 additions & 0 deletions
@@ -0,0 +1,253 @@
+import torch
+import triton
+import triton.language as tl
+from torch import Tensor
+
+from torchao.utils import ceil_div
+
+
+def to_blocked_per_group_2d(
+    x_scales: Tensor, group_offs: Tensor, Mg: int, K: int, block_size: int = 32
+) -> tuple[Tensor, Tensor]:
+    """
+    Convert scales to blocked format for a 2D tensor (input activations / token groups).
+
+    Args:
+        x_scales: Tensor with the per-group e8m0 scales concatenated into one tensor along the Mg dimension.
+        group_offs: Tensor of shape (num_groups,) which contains the end index of each group along the Mg dimension.
+        Mg: total size of all groups summed together
+        K: K dim size
+
+    Returns:
+        blocked_scales: Tensor with the per-group scales in blocked format, concatenated into one tensor.
+        start_row_after_padding: Tensor of shape (num_groups + 1,) which contains the start row after padding for each group.
+    """
+    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import _to_blocked
+
+    assert x_scales.ndim == 2, "x_scales must be 2D"
+    assert block_size == 32, "Only block_size=32 is supported for now"
+    blocked_scales_list = []
+    start_row_after_padding_list = [0]
+    group_start_idx = 0
+    for i, group_end_idx in enumerate(group_offs.tolist()):
+        group_size = group_end_idx - group_start_idx
+        prev_start_row_after_padding = start_row_after_padding_list[i]
+        if group_size == 0:
+            start_row_after_padding_list.append(prev_start_row_after_padding)
+            continue
+
+        # Convert group scales to blocked format
+        group_scales = x_scales[group_start_idx:group_end_idx]
+        group_scales_blocked = _to_blocked(group_scales)
+        blocked_scales_list.append(group_scales_blocked)
+
+        # Calculate the start row after padding
+        scaling_groups_per_row = K // block_size
+        rows_for_group = group_scales_blocked.numel() // scaling_groups_per_row
+        new_start_row = prev_start_row_after_padding + rows_for_group
+        start_row_after_padding_list.append(new_start_row)
+
+        # Update next group start index
+        group_start_idx = group_end_idx
+
+    blocked_scales = torch.cat(blocked_scales_list, dim=0).contiguous()
+    blocked_scales = blocked_scales.reshape(-1, K // 32)
+    start_row_after_padding = torch.tensor(
+        start_row_after_padding_list, device=x_scales.device, dtype=torch.int64
+    )
+    return blocked_scales, start_row_after_padding
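
To make the row padding concrete: _to_blocked pads each group to a multiple of 128 rows (and the scale columns to a multiple of 4), so when K // block_size is already a multiple of 4, rows_for_group is just the group's row count rounded up to 128. An illustrative calculation (hypothetical group sizes, not from the diff):

    # Expected start rows for groups of 100, 200, and 50 scale rows
    group_rows = [100, 200, 50]
    start_rows = [0]
    for rows in group_rows:
        padded = ((rows + 127) // 128) * 128  # 100 -> 128, 200 -> 256, 50 -> 128
        start_rows.append(start_rows[-1] + padded)
    print(start_rows)  # [0, 128, 384, 512]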
+
+
+def to_blocked_per_group_3d(weight_scales: Tensor) -> Tensor:
+    """
+    Convert scales to blocked format for each group for a 3D tensor (expert weights).
+
+    Args:
+        weight_scales: Tensor of shape (E, N, K // block_size) containing the
+            e8m0 scales for each expert group.
+    """
+    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import _to_blocked
+
+    blocked_scales_list = []
+    num_groups = weight_scales.shape[0]
+    for i in range(num_groups):
+        group_scales = weight_scales[i]
+        group_scales_blocked = _to_blocked(group_scales)
+        blocked_scales_list.append(group_scales_blocked)
+    weight_scales_blocked = torch.stack(blocked_scales_list, dim=0).contiguous()
+    weight_scales_blocked = weight_scales_blocked.reshape(num_groups, -1)
+    return weight_scales_blocked
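
Shape-wise, each expert's (N, K // block_size) scale matrix is padded independently, so the stacked result holds padded_rows * padded_cols entries per expert. A quick arithmetic sketch (hypothetical sizes, not from the diff):

    # Assumes _to_blocked pads rows to a multiple of 128 and cols to a multiple of 4
    E, N, scale_cols = 2, 100, 8
    padded = ((N + 127) // 128) * 128 * ((scale_cols + 3) // 4) * 4
    print((E, padded))  # expected blocked shape: (2, 1024)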
+
+
+def compute_per_group_blocked_scale_offsets(offsets: torch.Tensor):
+    """
+    Rounds each group size up to the nearest multiple of 128, given the group end offsets.
+
+    Args:
+        offsets: A 1D PyTorch tensor of integers in ascending sorted order, representing the end index of each group along the Mg dimension.
+
+    Returns:
+        - group_sizes: A 1D PyTorch tensor of integers representing the size of each group.
+        - starting_row_after_padding: 1D integer tensor of shape (num_groups + 1,) representing the starting row of each group after padding to blocked format.
+    """
+    # Calculate group sizes
+    zero = torch.tensor([0], dtype=offsets.dtype, device=offsets.device)
+    group_sizes = torch.diff(offsets, prepend=zero).to(torch.int64)
+
+    # Round each group size up to the nearest multiple of 128
+    rounded_group_sizes = ceil_div(group_sizes, 128) * 128
+
+    # Calculate the starting row after padding for each group
+    starting_row_after_padding = torch.cumsum(rounded_group_sizes, dim=0)
+
+    # Must start with 0
+    starting_row_after_padding = torch.cat([zero, starting_row_after_padding])
+    return group_sizes, starting_row_after_padding
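
A concrete run of this helper (illustrative offsets, not from the diff):

    import torch
    from torchao.utils import ceil_div

    offsets = torch.tensor([96, 160, 416])           # group end rows along Mg
    zero = torch.tensor([0], dtype=offsets.dtype)
    group_sizes = torch.diff(offsets, prepend=zero)  # tensor([ 96,  64, 256])
    rounded = ceil_div(group_sizes, 128) * 128       # tensor([128, 128, 256])
    print(torch.cat([zero, torch.cumsum(rounded, dim=0)]))  # tensor([  0, 128, 256, 512])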
+
+
+# @triton_op("torchao::triton_mx_block_rearrange_per_group", mutates_args=())
+def triton_mx_block_rearrange_per_group(
+    scales_tensor: torch.Tensor,
+    offsets: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Rearranges an E8M0 scales tensor to the block-scaled swizzle format.
+
+    This format is suitable for Tmem as described in NVIDIA documentation:
+    https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
+
+    Args:
+        scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor.
+        offsets: Tensor of shape (num_groups,) which contains the end index of each group along the M dimension.
+
+    Returns:
+        Rearranged tensor in block-scaled swizzle format, and the start row of each group after padding.
+    """
+    assert scales_tensor.element_size() == 1, (
+        "Expected element size to be 1 byte (8 bits)"
+    )
+    _, output_scales_group_offsets = compute_per_group_blocked_scale_offsets(offsets)
+    rows, cols = scales_tensor.shape
+
+    # Calculate blocks needed
+    num_groups = output_scales_group_offsets.numel()
+
+    # Final offset is the total number of rows in the padded output tensor
+    padded_rows = output_scales_group_offsets[-1]
+    num_col_blocks = ceil_div(cols, 4)
+    padded_cols = num_col_blocks * 4
+    out = scales_tensor.new_empty((padded_rows, padded_cols))
+
+    # We probably want to handle multiple blocks per tile, but for now keep it simple
+    BLOCK_ROWS, BLOCK_COLS = 128, 4
+
+    # Output block stride for the rearranged format
+    output_stride_per_block = BLOCK_ROWS * BLOCK_COLS
+    output_stride_per_row_of_blocks = (
+        BLOCK_ROWS * BLOCK_COLS * (padded_cols // BLOCK_COLS)
+    )
+
+    # We parallelize per group and per col block.
+    # Rows per group is variable, so we just loop through row blocks per group, per col block.
+    grid = lambda META: (
+        num_groups,
+        num_col_blocks,
+    )
+
+    triton_scale_swizzle_per_group[grid](
+        # Input scales
+        scales_tensor.view(torch.uint8),
+        scales_tensor.stride(0),
+        scales_tensor.stride(1),
+        rows,
+        cols,
+        num_groups,
+        # Original offsets (to read from)
+        offsets,
+        # Output scales tensor and group offsets after padding (to write to)
+        out.view(torch.uint8),
+        output_scales_group_offsets,
+        output_stride_per_block,
+        output_stride_per_row_of_blocks,
+        BLOCK_ROWS=BLOCK_ROWS,
+        BLOCK_COLS=BLOCK_COLS,
+    )
+    return out, output_scales_group_offsets
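
A minimal usage sketch for the wrapper above (illustrative values, not from the diff; requires CUDA and Triton, with random bytes standing in for real e8m0 scales):

    import torch

    scales = torch.randint(0, 255, (256, 8), dtype=torch.uint8, device="cuda")
    offs = torch.tensor([64, 96, 192, 256], device="cuda", dtype=torch.int32)
    blocked, padded_offs = triton_mx_block_rearrange_per_group(scales, offs)
    print(blocked.shape)  # expected: (512, 8), each of the 4 groups padded to 128 rows
    print(padded_offs)    # expected: tensor([0, 128, 256, 384, 512])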
+
+
+@triton.jit
+def triton_scale_swizzle_per_group(
+    scales_ptr,  # (M, K // block_size)
+    scales_stride_dim0,
+    scales_stride_dim1,
+    scale_rows,
+    scale_cols,
+    num_groups,
+    orig_offsets,  # (num_groups,)
+    output_scales_ptr,  # (rows + num_groups * 128, tl.cdiv(K, 4) * 4)
+    output_scales_group_offsets,  # (num_groups,)
+    output_stride_per_block,
+    output_stride_per_row_of_blocks,
+    BLOCK_ROWS: tl.constexpr,
+    BLOCK_COLS: tl.constexpr,
+):
+    group_pid = tl.program_id(0)
+    block_col_pid = tl.program_id(1)
+
+    # Input scales row range for this group
+    input_group_start_row = tl.load(
+        orig_offsets + group_pid - 1, mask=group_pid > 0, other=0
+    )
+    input_group_end_row = tl.load(
+        orig_offsets + group_pid, mask=group_pid < num_groups, other=0
+    )
+
+    # Output scales start row we will begin writing to
+    output_group_start_row = tl.load(
+        output_scales_group_offsets + group_pid, mask=group_pid < num_groups, other=0
+    )
+
+    # Calculate destination indices for each row and col in the block-swizzled layout.
+    # We can reuse this swizzle transformation on each block of data we read.
+    row_offs = tl.arange(0, BLOCK_ROWS)[:, None]
+    col_offs = tl.arange(0, BLOCK_COLS)[None, :]
+    r_div_32 = row_offs // 32
+    r_mod_32 = row_offs % 32
+
+    # Rearrange to (32, 4, 4) then to final (32, 16) coordinates
+    dest_indices = r_mod_32 * 16 + r_div_32 * 4 + col_offs
+
+    # Flatten
+    dest_indices_flat = tl.reshape(dest_indices, (BLOCK_ROWS * BLOCK_COLS))
+
+    # For this group and col block, we iterate through row blocks, reading (BLOCK_ROWS, BLOCK_COLS) from the input scales.
+    # We track how many row blocks we have iterated through.
+    block_row_id = 0
+    current_start_row = input_group_start_row
+    while current_start_row < input_group_end_row:
+        # Read block of input scales
+        block_row_offs = current_start_row + row_offs
+        block_col_offs = block_col_pid * BLOCK_COLS + col_offs
+        block_offs = (
+            block_row_offs * scales_stride_dim0 + block_col_offs * scales_stride_dim1
+        )
+        mask = (block_row_offs < input_group_end_row) & (block_col_offs < scale_cols)
+        input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0)
+        scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS))
+
+        # Calculate block offset using provided output block stride
+        output_block_offsets = (
+            output_group_start_row * scale_cols
+            + (block_row_id * output_stride_per_row_of_blocks)
+            + (block_col_pid * output_stride_per_block)
+        )
+
+        # Apply swizzling for write to gmem
+        tl.store(
+            output_scales_ptr + output_block_offsets + dest_indices_flat,
+            scales_flat,
+        )
+
+        # Update row block id to next block
+        block_row_id += 1
+        current_start_row += BLOCK_ROWS
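
The dest_indices formula above maps each (128, 4) tile onto the (32, 16) swizzled layout; a quick editorial check (plain PyTorch, not from the diff) that it permutes all 512 positions of a tile exactly once:

    import torch

    rows = torch.arange(128)[:, None]  # BLOCK_ROWS
    cols = torch.arange(4)[None, :]    # BLOCK_COLS
    dest = (rows % 32) * 16 + (rows // 32) * 4 + cols
    assert sorted(dest.flatten().tolist()) == list(range(128 * 4))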

torchao/prototype/moe_training/kernels/mxfp8.py renamed to torchao/prototype/moe_training/kernels/mxfp8_gemms.py

Lines changed: 5 additions & 0 deletions
@@ -19,6 +19,8 @@
     "If errors persist, please file a bug report."
 )
 
+DEBUG = False
+
 
 @torch.library.custom_op("torchao::fbgemm_mxfp8_grouped_mm_2d_3d", mutates_args={})
 def fbgemm_mxfp8_grouped_mm_2d_3d(
@@ -108,6 +110,9 @@ def _log_inputs(
     group_sizes: torch.Tensor,
     starting_row_after_padding: torch.Tensor,
 ):
+    if not DEBUG:
+        return
+
     logger.info(f"offs: {offs}, dtype: {offs.dtype}")
     logger.info(
         f"A_fp8.shape: {A_fp8.shape}, stride: {A_fp8.stride()}, dtype: {A_fp8.dtype}"
