From 66edd7454d8b9bf491035b6029b260c349b412ed Mon Sep 17 00:00:00 2001
From: Ruihang Lai
Date: Mon, 16 Jun 2025 15:22:23 -0400
Subject: [PATCH] [CUTLASS] Fix CUTLASS kernel build on Hopper

The CUTLASS kernel build on Hopper GPUs has been broken since #18033.
This PR fixes the issue by moving the KernelTraits specializations into
fp16_group_gemm_runner_sm90.cuh and by updating fp8_group_gemm_sm90.cu
to call cutlass_group_gemm_sm90 with the correct includes.
---
 .../cutlass/fp16_group_gemm_runner_sm90.cuh        | 14 ++++++++++++++
 .../contrib/cutlass/fp16_group_gemm_sm90.cu        | 14 --------------
 src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu | 12 ++++++------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/runtime/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh b/src/runtime/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
index 38e1beb2b8f4..246063ca0341 100644
--- a/src/runtime/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
+++ b/src/runtime/contrib/cutlass/fp16_group_gemm_runner_sm90.cuh
@@ -57,6 +57,20 @@ inline size_t aligned(size_t value, size_t alignment = 16) {
 template <typename T>
 struct KernelTraits;
 
+template <>
+struct KernelTraits<cutlass::half_t> {
+  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
+  using TileShape = Shape<_128, _256, _64>;  // Threadblock-level tile size
+  using ClusterShape = Shape<_2, _2, _1>;    // Shape of the threadblocks in a cluster
+};
+
+template <>
+struct KernelTraits<cutlass::bfloat16_t> {
+  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
+  using TileShape = Shape<_128, _256, _64>;  // Threadblock-level tile size
+  using ClusterShape = Shape<_2, _2, _1>;    // Shape of the threadblocks in a cluster
+};
+
 template <
diff --git a/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu b/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu
--- a/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu
+++ b/src/runtime/contrib/cutlass/fp16_group_gemm_sm90.cu
@@ ... @@
-template <>
-struct KernelTraits<cutlass::half_t> {
-  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
-  using TileShape = Shape<_128, _256, _64>;  // Threadblock-level tile size
-  using ClusterShape = Shape<_2, _2, _1>;    // Shape of the threadblocks in a cluster
-};
-
-template <>
-struct KernelTraits<cutlass::bfloat16_t> {
-  using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative;
-  using TileShape = Shape<_128, _256, _64>;  // Threadblock-level tile size
-  using ClusterShape = Shape<_2, _2, _1>;    // Shape of the threadblocks in a cluster
-};
-
 void tvm_cutlass_group_gemm_sm90(NDArray x, NDArray weight, NDArray indptr, NDArray workspace,
                                  NDArray out) {
   tvm_cutlass_group_gemm_impl<90>(x, weight, indptr, workspace, out);
diff --git a/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu b/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu
index 686a6ebcffeb..0eaa6a1efb77 100644
--- a/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu
+++ b/src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu
@@ -19,9 +19,8 @@
 #include
 #include
-#include
-#include
 #include
+#include
 
 #include "fp16_group_gemm_runner_sm90.cuh"
@@ -60,10 +59,11 @@ void tvm_cutlass_fp8_group_gemm(NDArray x, NDArray weight, NDArray indptr, NDArr
   int n = weight->shape[1];
   int k = x->shape[1];
   const float* beta = nullptr;
-  cutlass_group_gemm(static_cast<ElementA*>(x->data), static_cast<ElementB*>(weight->data),
-                     static_cast<int64_t*>(indptr->data), static_cast<uint8_t*>(workspace->data),
-                     workspace->shape[0], n, k, num_groups, static_cast<float*>(alpha->data), beta,
-                     static_cast<ElementC*>(out->data), stream);
+  cutlass_group_gemm_sm90(static_cast<ElementA*>(x->data), static_cast<ElementB*>(weight->data),
+                          static_cast<int64_t*>(indptr->data),
+                          static_cast<uint8_t*>(workspace->data), workspace->shape[0], n, k,
+                          num_groups, static_cast<float*>(alpha->data), beta,
+                          static_cast<ElementC*>(out->data), stream);
 }
 
 TVM_FFI_REGISTER_GLOBAL("cutlass.group_gemm_e5m2_e5m2_fp16")
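
Reviewer note (not part of the commit): moving the KernelTraits specializations into
fp16_group_gemm_runner_sm90.cuh makes them visible to every translation unit that
instantiates the shared runner template. The minimal, self-contained C++ sketch below
illustrates that visibility rule; every name in it (run_group_gemm, tile_m, and the
float/double stand-ins for cutlass::half_t / cutlass::bfloat16_t) is hypothetical and
is not TVM code.

    // sketch.cpp -- illustrative only; names are hypothetical, not TVM's.
    #include <iostream>

    // Primary template: declared but never defined, as in the runner header.
    template <typename T>
    struct KernelTraits;

    // Explicit specializations, standing in for the ones this patch moves
    // into the shared header (float/double play the role of
    // cutlass::half_t / cutlass::bfloat16_t).
    template <>
    struct KernelTraits<float> {
      static constexpr int tile_m = 128;  // stand-in for TileShape
    };

    template <>
    struct KernelTraits<double> {
      static constexpr int tile_m = 128;
    };

    // A consumer template, playing the role of the group GEMM runner. It can
    // only be instantiated for types whose KernelTraits specialization is
    // visible in this translation unit -- which is why the specializations
    // belong in the shared header rather than in a single .cu file.
    template <typename T>
    void run_group_gemm() {
      std::cout << "tile_m = " << KernelTraits<T>::tile_m << "\n";
    }

    int main() {
      run_group_gemm<float>();   // OK: KernelTraits<float> is visible here
      run_group_gemm<double>();  // OK: KernelTraits<double> is visible here
      // run_group_gemm<int>();  // would not compile: KernelTraits<int> is incomplete
      return 0;
    }

Compiling the sketch with g++ -std=c++17 sketch.cpp succeeds and prints tile_m = 128
twice; if the two specializations lived only in a separate .cu file, the instantiations
in main would fail with an incomplete-type error, which matches the kind of breakage a
specialization that is invisible at the point of instantiation causes.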