
Commit c66c5c0

[mxfp8 moe training] add per group blocked scale kernels
stack-info: PR: #2886, branch: danielvegamyhre/stack/62
1 parent 16f5bef commit c66c5c0

File tree

4 files changed: +287 −1 lines changed

test/prototype/moe_training/test_kernels.py
torchao/prototype/moe_training/kernels/__init__.py
torchao/prototype/moe_training/kernels/mxfp8_blocked_scales.py (added)
torchao/prototype/moe_training/kernels/mxfp8.py → mxfp8_gemms.py (renamed)

test/prototype/moe_training/test_kernels.py

Lines changed: 32 additions & 0 deletions
@@ -7,6 +7,8 @@
 import pytest
 import torch
 
+from torchao.prototype.mx_formats.utils import to_blocked_per_group_2d
+
 # We need to skip before doing any imports which would use triton, since
 # triton won't be available on CPU builds
 if not (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9):
@@ -20,12 +22,17 @@
     triton_fp8_per_group_colwise_scales,
     triton_fp8_per_group_rowwise_scales,
 )
+from torchao.prototype.moe_training.kernels.mxfp8_blocked_scales import (
+    triton_mx_block_rearrange_per_group,
+)
 from torchao.prototype.moe_training.utils import (
     _is_column_major,
+    generate_jagged_offs,
     torch_to_3d_rowwise_float8_transpose_rhs,
     torch_to_float8_per_group_colwise,
     torch_to_float8_per_group_rowwise,
 )
+from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.testing.utils import skip_if_rocm
 
 
@@ -118,3 +125,28 @@ def test_fp8_rowwise_3d_transpose_rhs(round_scales_to_power_of_2: bool):
     assert ref_fp8.shape == triton_fp8.shape, "output shapes not equal"
     assert ref_fp8.stride() == triton_fp8.stride(), "output strides not equal"
     assert torch.allclose(ref_fp8, triton_fp8, rtol=0, atol=0), "fp8 data not equal"
+
+
+@skip_if_rocm("ROCm enablement in progress")
+@pytest.mark.parametrize("round_scales_to_power_of_2", [True, False])
+@pytest.mark.parametrize("m,k,n_groups", [(256, 256, 4)])
+def test_mxfp8_per_group_blocked_scales_2d(
+    m: int, k: int, n_groups: int, round_scales_to_power_of_2: bool
+):
+    device = "cuda"
+    block_size = 32
+    input_data = torch.randn(m, k, device=device)
+    e8m0_scales, _ = to_mx(
+        input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+    )
+    offs = generate_jagged_offs(n_groups, m, multiple_of=block_size, device=device)
+
+    # torch reference
+    ref_out = to_blocked_per_group_2d(e8m0_scales, offs, m, k, block_size=block_size)
+
+    # triton kernel
+    triton_out = triton_mx_block_rearrange_per_group(e8m0_scales, offs)
+
+    assert torch.allclose(ref_out, triton_out, atol=0, rtol=0), (
+        "blocked scales not equal"
+    )
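
For context on the jagged-offsets convention this test relies on, here is a minimal sketch of the contract assumed for generate_jagged_offs: it returns cumulative end indices, one per group, each a multiple of multiple_of, with the last equal to the total row count. The exact boundary values are up to the utility; only the invariants matter here.

import torch

# Hypothetical output for n_groups=4, m=256, multiple_of=32:
offs = torch.tensor([64, 96, 192, 256], device="cuda", dtype=torch.int32)

# Group i spans rows [offs[i-1], offs[i]) of the (m, k) input, with offs[-1] == m,
# so the group sizes in this example are 64, 32, 96, and 64 rows.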

torchao/prototype/moe_training/kernels/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -7,6 +7,6 @@
 from torchao.prototype.moe_training.kernels.jagged_float8_scales import (
     triton_fp8_per_group_rowwise_scales as triton_fp8_per_group_rowwise_scales,
 )
-from torchao.prototype.moe_training.kernels.mxfp8 import (
+from torchao.prototype.moe_training.kernels.mxfp8_gemms import (
     fbgemm_mxfp8_grouped_mm_2d_3d as fbgemm_mxfp8_grouped_mm_2d_3d,
 )
torchao/prototype/moe_training/kernels/mxfp8_blocked_scales.py

Lines changed: 249 additions & 0 deletions
@@ -0,0 +1,249 @@
+import torch
+import triton
+import triton.language as tl
+from torch import Tensor
+from torch.library import triton_op, wrap_triton
+
+from torchao.utils import ceil_div
+
+
+def to_blocked_per_group_2d(
+    x_scales: Tensor, group_offs: Tensor, Mg: int, K: int, block_size: int = 32
+) -> tuple[Tensor, Tensor]:
+    """
+    Convert scales to blocked format per group for a 2D tensor (input activations / token groups).
+
+    Args:
+        x_scales: Tensor containing the e8m0 scales for each group, concatenated along dim 0.
+        group_offs: Tensor of shape (num_groups,) containing the end index of each group along the Mg dimension.
+        Mg: total size of all groups summed together
+        K: K dim size
+
+    Returns:
+        blocked_scales: Tensor with each group's scales in blocked format, concatenated along dim 0.
+        start_row_after_padding: Tensor of shape (num_groups + 1,) containing the start row after padding for each group, beginning with 0.
+    """
+    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import _to_blocked
+
+    assert x_scales.ndim == 2, "x_scales must be 2D"
+    assert block_size == 32, "Only block_size=32 is supported for now"
+    blocked_scales_list = []
+    start_row_after_padding_list = [0]
+    group_start_idx = 0
+    for i, group_end_idx in enumerate(group_offs.tolist()):
+        group_size = group_end_idx - group_start_idx
+        prev_start_row_after_padding = start_row_after_padding_list[i]
+        if group_size == 0:
+            start_row_after_padding_list.append(prev_start_row_after_padding)
+            continue
+
+        # Convert group scales to blocked format
+        group_scales = x_scales[group_start_idx:group_end_idx]
+        group_scales_blocked = _to_blocked(group_scales)
+        blocked_scales_list.append(group_scales_blocked)
+
+        # Calculate the start row after padding
+        scaling_groups_per_row = K // block_size
+        rows_for_group = group_scales_blocked.numel() // scaling_groups_per_row
+        new_start_row = prev_start_row_after_padding + rows_for_group
+        start_row_after_padding_list.append(new_start_row)
+
+        # Update next group start index
+        group_start_idx = group_end_idx
+
+    blocked_scales = torch.cat(blocked_scales_list, dim=0).contiguous()
+    blocked_scales = blocked_scales.reshape(-1, K // block_size)
+    start_row_after_padding = torch.tensor(
+        start_row_after_padding_list, device=x_scales.device, dtype=torch.int64
+    )
+    return blocked_scales, start_row_after_padding
+
+
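To make the padding bookkeeping above concrete, here is a small pure-Python sketch of the start_row_after_padding computation, assuming (as the rows_for_group arithmetic implies) that fbgemm's _to_blocked pads each group's rows up to a multiple of 128:

import math

# Two groups ending at rows 100 and 256 of a (256, 8) scale tensor (K=256, block_size=32).
group_offs = [100, 256]

start_rows = [0]
group_start = 0
for group_end in group_offs:
    rows = group_end - group_start
    padded = math.ceil(rows / 128) * 128  # 100 -> 128 rows, 156 -> 256 rows
    start_rows.append(start_rows[-1] + padded)
    group_start = group_end

print(start_rows)  # [0, 128, 384]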
+def to_blocked_per_group_3d(weight_scales: Tensor) -> Tensor:
+    """
+    Convert scales to blocked format for each group for a 3D tensor (expert weights).
+
+    Args:
+        weight_scales: Tensor of shape (E, N, K // block_size) containing per-expert scales.
+
+    Returns:
+        Tensor of shape (E, -1) with each expert's scales in blocked format.
+    """
+    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import _to_blocked
+
+    blocked_scales_list = []
+    num_groups = weight_scales.shape[0]
+    for i in range(num_groups):
+        group_scales = weight_scales[i]
+        group_scales_blocked = _to_blocked(group_scales)
+        blocked_scales_list.append(group_scales_blocked)
+    weight_scales_blocked = torch.stack(blocked_scales_list, dim=0).contiguous()
+    weight_scales_blocked = weight_scales_blocked.reshape(num_groups, -1)
+    return weight_scales_blocked
+
+
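A quick shape illustration for the 3D path, with uint8 standing in for the 1-byte e8m0 scales (a sketch, assuming N and K // block_size are already multiples of the 128x4 tile so _to_blocked adds no padding):

import torch

E, N, K, block_size = 4, 256, 256, 32
weight_scales = torch.empty(E, N, K // block_size, dtype=torch.uint8, device="cuda")

# Each expert's (256, 8) scale matrix is swizzled independently and flattened,
# so the blocked result has one 2048-element row per expert: shape (4, 2048).
blocked = to_blocked_per_group_3d(weight_scales)
assert blocked.shape == (E, N * K // block_size)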
+def compute_per_group_blocked_scale_offsets(offsets: torch.Tensor):
+    """
+    Computes the per-group sizes and the starting row of each group after each
+    group's rows are padded up to the nearest multiple of 128.
+
+    Args:
+        offsets: A 1D tensor of integers in ascending order, representing the end index of each group along the Mg dimension.
+
+    Returns:
+        - group_sizes: A 1D tensor of integers representing the size of each group.
+        - starting_row_after_padding: A 1D tensor of shape (num_groups + 1,) holding the start row of each group after padding to blocked format, beginning with 0.
+    """
+    # Calculate group sizes
+    zero = torch.tensor([0], dtype=offsets.dtype, device=offsets.device)
+    group_sizes = torch.diff(offsets, prepend=zero).to(torch.int64)
+
+    # Round each group size up to the nearest multiple of 128
+    rounded_group_sizes = ceil_div(group_sizes, 128) * 128
+
+    # Cumsum with a leading zero, so entry i is the start row of group i in the
+    # padded blocked layout (the kernel reads both entry i and entry i + 1)
+    starting_row_after_padding = torch.cumsum(rounded_group_sizes, dim=0)
+    starting_row_after_padding = torch.cat(
+        [zero.to(torch.int64), starting_row_after_padding]
+    )
+    return group_sizes, starting_row_after_padding
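A worked example of the offset math, runnable on CPU since only integer bookkeeping is involved (import path per the module above):

import torch
from torchao.prototype.moe_training.kernels.mxfp8_blocked_scales import (
    compute_per_group_blocked_scale_offsets,
)

# Three groups ending at rows 100, 256, and 300:
offsets = torch.tensor([100, 256, 300], dtype=torch.int32)
group_sizes, starts = compute_per_group_blocked_scale_offsets(offsets)

print(group_sizes.tolist())  # [100, 156, 44]
print(starts.tolist())       # [0, 128, 384, 512]: sizes rounded up to 128, then cumsum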
+@triton_op("torchao::triton_mx_block_rearrange_per_group", mutates_args=())
+def triton_mx_block_rearrange_per_group(
+    scales_tensor: torch.Tensor,
+    offsets: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Rearranges an E8M0 scales tensor into block-scaled swizzle format, group by group.
+
+    This format is suitable for Tmem as described in NVIDIA documentation:
+    https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
+
+    Args:
+        scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor.
+        offsets: Tensor of shape (num_groups,) containing the end index of each group along the M dimension.
+
+    Returns:
+        Rearranged tensor in block-scaled swizzle format
+    """
+    assert scales_tensor.element_size() == 1, (
+        "Expected element size to be 1 byte (8 bits)"
+    )
+    _, output_scales_group_offsets = compute_per_group_blocked_scale_offsets(offsets)
+    rows, cols = scales_tensor.shape
+
+    # Calculate blocks needed; the final offset is the total number of rows in
+    # the padded output tensor
+    num_groups = offsets.numel()
+    padded_rows = output_scales_group_offsets[-1]
+    num_col_blocks = ceil_div(cols, 4)
+    padded_cols = num_col_blocks * 4
+    out = scales_tensor.new_empty((padded_rows, padded_cols))
+
+    # We probably want to handle multiple blocks per tile, but for now keep it simple
+    BLOCK_ROWS, BLOCK_COLS = 128, 4
+
+    # Output block stride for the rearranged format
+    output_stride_per_row_of_blocks = (
+        BLOCK_ROWS * BLOCK_COLS * (padded_cols // BLOCK_COLS)
+    )
+
+    # We parallelize per group and per col block.
+    # Rows per group is variable, so we just loop through row blocks per group, per col block.
+    grid = lambda META: (
+        num_groups,
+        num_col_blocks,
+    )
+
+    wrap_triton(triton_scale_swizzle_per_group)[grid](
+        # Input scales
+        scales_tensor.view(torch.uint8),
+        scales_tensor.stride(0),
+        scales_tensor.stride(1),
+        rows,
+        cols,
+        num_groups,
+        # Original offsets (to read from)
+        offsets,
+        # Output scales tensor and group offsets after padding (to write to)
+        out.view(torch.uint8),
+        output_scales_group_offsets,
+        output_stride_per_row_of_blocks,
+        BLOCK_ROWS=BLOCK_ROWS,
+        BLOCK_COLS=BLOCK_COLS,
+    )
+    return out
+
+
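A hedged end-to-end sketch of how the op composes with the quantization helpers (shapes mirror the unit test; generate_jagged_offs is the test utility shown earlier):

import torch
from torchao.prototype.moe_training.kernels.mxfp8_blocked_scales import (
    triton_mx_block_rearrange_per_group,
)
from torchao.prototype.moe_training.utils import generate_jagged_offs
from torchao.prototype.mx_formats.mx_tensor import to_mx

Mg, K, n_groups, block_size = 256, 256, 4, 32
x = torch.randn(Mg, K, device="cuda")

# e8m0_scales: (Mg, K // block_size) = (256, 8), one scale per 1x32 block
e8m0_scales, _ = to_mx(x, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
offs = generate_jagged_offs(n_groups, Mg, multiple_of=block_size, device="cuda")

# Each group's rows are swizzled and padded up to a multiple of 128
blocked = triton_mx_block_rearrange_per_group(e8m0_scales, offs)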
+@triton.jit
+def triton_scale_swizzle_per_group(
+    scales_ptr,  # (M, K // block_size)
+    scales_stride_dim0,
+    scales_stride_dim1,
+    scale_rows,
+    scale_cols,
+    num_groups,
+    orig_offsets,  # (num_groups,)
+    output_scales_ptr,  # (rows + num_groups * 128, tl.cdiv(K, 4) * 4)
+    output_scales_group_offsets,  # (num_groups + 1,)
+    output_stride_per_row_of_blocks,
+    BLOCK_ROWS: tl.constexpr,
+    BLOCK_COLS: tl.constexpr,
+):
+    group_pid = tl.program_id(0)
+    block_col_pid = tl.program_id(1)
+
+    row_offs = tl.arange(0, BLOCK_ROWS)[:, None]
+    col_offs = tl.arange(0, BLOCK_COLS)[None, :]
+
+    # Row range for this group
+    input_start_row = tl.load(orig_offsets + group_pid - 1, mask=group_pid > 0, other=0)
+    input_end_row = tl.load(
+        orig_offsets + group_pid, mask=group_pid < num_groups, other=0
+    )
+
+    # Base offset in the output scales tensor we will write to
+    output_row_start_offset = tl.load(
+        output_scales_group_offsets + group_pid, mask=group_pid < num_groups, other=0
+    )
+    output_row_end_offset = tl.load(
+        output_scales_group_offsets + group_pid + 1,
+        mask=group_pid < num_groups,
+        other=0,
+    )
+
+    # For this group and col block, we iterate through blocks, reading (BLOCK_ROWS, BLOCK_COLS) from the input scales.
+    # We need to track how many row blocks we iterated through.
+    block_row_id = 0
+    for row_off in tl.range(
+        input_start_row, tl.cdiv(input_end_row, BLOCK_ROWS) * BLOCK_ROWS, BLOCK_ROWS
+    ):
+        # Read block of input scales
+        block_row_offs = row_off + row_offs
+        block_col_offs = block_col_pid * BLOCK_COLS + col_offs
+        block_offs = (
+            block_row_offs * scales_stride_dim0 + block_col_offs * scales_stride_dim1
+        )
+        mask = (block_row_offs < input_end_row) & (block_col_offs < scale_cols)
+        input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0)
+
+        # Calculate destination indices for each row and col within the block
+        r_div_32 = row_offs // 32
+        r_mod_32 = row_offs % 32
+
+        # Rearrange to (32, 4, 4) then to final (32, 16) coordinates
+        dest_indices = r_mod_32 * 16 + r_div_32 * 4 + col_offs
+
+        # Flatten
+        dest_indices_flat = tl.reshape(dest_indices, (BLOCK_ROWS * BLOCK_COLS))
+        scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS))
+
+        # Calculate block offset using provided output block stride
+        output_block_offsets = (
+            output_row_start_offset
+            + (block_row_id * output_stride_per_row_of_blocks)
+            + (block_col_pid * BLOCK_COLS)
+        )
+
+        tl.store(
+            output_scales_ptr + output_block_offsets + dest_indices_flat,
+            scales_flat,
+            mask=output_block_offsets < output_row_end_offset,
+        )
+
+        # Update row block id to next block
+        block_row_id += 1
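
The dest_indices line is the heart of the swizzle: within one (128, 4) tile, element (r, c) lands at flat position (r % 32) * 16 + (r // 32) * 4 + c of the 512-element swizzled tile, so the four 32-row sub-tiles are interleaved along the fast axis. A tiny standalone check of that index map:

def swizzled_index(r: int, c: int) -> int:
    # (128, 4) tile -> (32, 4, 4) -> flattened (32, 16) swizzle
    return (r % 32) * 16 + (r // 32) * 4 + c

# Rows 0, 32, 64, 96 (same column) map to adjacent 4-element chunks...
assert [swizzled_index(r, 0) for r in (0, 32, 64, 96)] == [0, 4, 8, 12]
# ...and the next logical row starts a new 16-wide chunk.
assert swizzled_index(1, 0) == 16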

torchao/prototype/moe_training/kernels/mxfp8.py renamed to torchao/prototype/moe_training/kernels/mxfp8_gemms.py

Lines changed: 5 additions & 0 deletions
@@ -19,6 +19,8 @@
         "If errors persist, please file a bug report."
     )
 
+DEBUG = False
+
 
 @torch.library.custom_op("torchao::fbgemm_mxfp8_grouped_mm_2d_3d", mutates_args={})
 def fbgemm_mxfp8_grouped_mm_2d_3d(
@@ -108,6 +110,9 @@ def _log_inputs(
     group_sizes: torch.Tensor,
     starting_row_after_padding: torch.Tensor,
 ):
+    if not DEBUG:
+        return
+
     logger.info(f"offs: {offs}, dtype: {offs.dtype}")
     logger.info(
         f"A_fp8.shape: {A_fp8.shape}, stride: {A_fp8.stride()}, dtype: {A_fp8.dtype}"
