Commit 3b01fac

bugfix: fix blackwell fmha hanging issue for empty kv_len (#1198)
## πŸ“Œ Description

Cherry-picked from the CUTLASS v4.0 changes.

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

cc @pavanimajety
1 parent 8c2d5ef commit 3b01fac

File tree

4 files changed (+182, -0 lines)


include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_epilogue_tma_warpspecialized.hpp

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ namespace cutlass::fmha::collective {

 template <class Element, class ElementAcc, class TileShape>
 struct Sm100FmhaFwdEpilogueTmaWarpspecialized {
+  using ElementOut = Element;
   using Pipeline = cutlass::PipelineAsync<2>;
   // using ShapeT = cute::Shape<int32_t, int32_t, cute::Shape<int32_t, int32_t>>;
   // using StrideO = cute::Shape<int32_t, _1, cute::Shape<int32_t, int32_t>>;

The new `ElementOut` alias exposes the epilogue's output element type so that the mainloop's new `correction_empty` (below) can query it via `typename CollectiveEpilogue::ElementOut`.

include/flashinfer/attention/blackwell/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp

Lines changed: 66 additions & 0 deletions

@@ -1088,6 +1088,72 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
     pipeline_epi.producer_commit(pipeline_epi_producer_state);
     ++pipeline_epi_producer_state;
   }
+
+  template <class BlkCoord, class ProblemShape, class ParamsProblemShape, class TensorStorageEpi,
+            class CollectiveEpilogue>
+  CUTLASS_DEVICE auto correction_empty(
+      BlkCoord const& blk_coord, Params const& params, ProblemShape const& problem_shape,
+      ParamsProblemShape const& params_problem_shape, TensorStorageEpi& shared_storage_epi,
+      PipelineE& pipeline_epi, typename PipelineE::PipelineState& pipeline_epi_producer_state,
+      CollectiveEpilogue& epilogue) {
+    pipeline_epi.producer_acquire(pipeline_epi_producer_state);
+
+    Tensor sO = make_tensor(make_smem_ptr(shared_storage_epi.smem_o.data()),
+                            typename TensorStorageEpi::SmemLayoutO{});
+    Tensor gLSE = make_tensor(make_gmem_ptr(epilogue.params.ptr_LSE), epilogue.params.layout_LSE);
+    int thread_idx = threadIdx.x % (4 * NumThreadsPerWarp);
+
+    using ElementOut = typename CollectiveEpilogue::ElementOut;
+    auto tiled_copy = make_cotiled_copy(
+        Copy_Atom<UniversalCopy<uint32_t>, ElementOut>{},
+        make_ordered_layout(make_shape(_128{}, Int<sizeof(uint32_t) / sizeof(ElementOut)>{}),
+                            Step<_1, _0>{}),
+        sO.layout());
+
+    auto thr_copy = tiled_copy.get_slice(thread_idx);
+    auto tOgO = thr_copy.partition_D(sO);
+    auto tOrO = make_tensor<ElementOut>(shape(tOgO(_, _, _, _0{})));
+    clear(tOrO);
+
+    copy(tiled_copy, tOrO, tOgO(_, _, _, _0{}));
+
+    if (epilogue.params.ptr_LSE != nullptr) {
+      int qo_tile_idx = get<0>(blk_coord);
+      int qo_head_idx = get<2, 0>(blk_coord);
+      int batch_idx = get<2, 1>(blk_coord);
+      int qo_len = get<0>(problem_shape);
+      int segment_offset = get<0>(params_problem_shape).segment_offsets[batch_idx];
+      int row_idx = thread_idx + get<0>(TileShape{}) * qo_tile_idx;
+
+      if (row_idx < qo_len) {
+        gLSE(segment_offset + row_idx, qo_head_idx) = -cuda::std::numeric_limits<float>::infinity();
+      }
+    }
+
+    pipeline_epi.producer_commit(pipeline_epi_producer_state);
+    ++pipeline_epi_producer_state;
+
+    copy(tiled_copy, tOrO, tOgO(_, _, _, _1{}));
+    cutlass::arch::fence_view_async_shared();
+    pipeline_epi.producer_acquire(pipeline_epi_producer_state);
+
+    if (epilogue.params.ptr_LSE != nullptr) {
+      int qo_tile_idx = get<0>(blk_coord);
+      int qo_head_idx = get<2, 0>(blk_coord);
+      int batch_idx = get<2, 1>(blk_coord);
+      int qo_len = get<0>(problem_shape);
+      int segment_offset = get<0>(params_problem_shape).segment_offsets[batch_idx];
+      int row_idx = thread_idx + get<0>(TileShape{}) * qo_tile_idx + get<0>(TileShapeQK{});
+
+      if (row_idx < qo_len) {
+        gLSE(segment_offset + row_idx, qo_head_idx) = -cuda::std::numeric_limits<float>::infinity();
+      }
+    }
+
+    cutlass::arch::fence_view_async_shared();
+    pipeline_epi.producer_commit(pipeline_epi_producer_state);
+    ++pipeline_epi_producer_state;
+  }
 };

 }  // namespace cutlass::fmha::collective
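For reference, `correction_empty` encodes the standard convention for attention over an empty key set: the output tile is zero-filled, and the LSE is set to -inf, since the logsumexp of an empty set is -inf. The two producer_acquire/commit rounds appear to mirror the epilogue's two-stage `PipelineAsync<2>`, each round covering one half of the query tile. A minimal host-side sketch of the same semantics for a single request (illustrative code, not from the repository):

```python
import torch

def single_request_attention(q, k, v, sm_scale):
    """q: [qo_len, d_qk], k: [kv_len, d_qk], v: [kv_len, d_vo]."""
    if k.shape[0] == 0:
        # What correction_empty writes on device: a zeroed O tile and LSE = -inf.
        o = torch.zeros(q.shape[0], v.shape[-1], dtype=q.dtype)
        lse = torch.full((q.shape[0],), float("-inf"))
        return o, lse
    s = (q.float() @ k.float().T) * sm_scale  # [qo_len, kv_len] attention scores
    lse = torch.logsumexp(s, dim=-1)          # [qo_len]
    o = (torch.softmax(s, dim=-1) @ v.float()).to(q.dtype)
    return o, lse
```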

include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp

Lines changed: 21 additions & 0 deletions

@@ -380,6 +380,10 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
        continue;
      }

+      if (get<1>(logical_problem_shape) == 0) {  // kv_len == 0
+        continue;
+      }
+
      bool is_softmax_0 = role == WarpRole::Softmax0;

      mainloop.softmax(
@@ -404,6 +408,13 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
        continue;
      }

+      if (get<1>(logical_problem_shape) == 0) {  // kv_len == 0
+        mainloop.correction_empty(blk_coord, params.mainloop, logical_problem_shape,
+                                  params.problem_shape, shared_storage.epilogue,
+                                  pipeline_corr_epi, pipeline_corr_epi_producer_state, epilogue);
+        continue;
+      }
+
      mainloop.correction(blk_coord, params.mainloop, params.problem_shape, logical_problem_shape,
                          shared_storage.epilogue, pipeline_s0_corr,
                          pipeline_s0_corr_consumer_state, pipeline_s1_corr,
@@ -437,6 +448,10 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
        continue;
      }

+      if (get<1>(logical_problem_shape) == 0) {  // kv_len == 0
+        continue;
+      }
+
      mainloop.mma(
          blk_coord, params.mainloop, logical_problem_shape, shared_storage.mainloop,
          pipeline_load_q, pipeline_load_q_consumer_state, pipeline_load_k,
@@ -461,6 +476,11 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
        continue;
      }

+      if (get<1>(logical_problem_shape) == 0) {  // kv_len == 0
+        work_idx++;
+        continue;
+      }
+
      mainloop.load(blk_coord, logical_problem_shape, params.mainloop, params.problem_shape,
                    shared_storage.mainloop, pipeline_load_q, pipeline_load_q_producer_state,
                    pipeline_load_k, pipeline_load_k_producer_state, pipeline_load_v,
@@ -491,6 +511,7 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
      epilogue.store(blk_coord, logical_problem_shape, params.epilogue, params.problem_shape,
                     shared_storage.epilogue, pipeline_corr_epi,
                     pipeline_corr_epi_consumer_state);
+
      work_idx++;
    }
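The hang these guards fix is a warp-specialization deadlock: if only some warp roles skipped a kv_len == 0 tile, the inter-warp pipelines would fall out of step and a consumer would block forever on a commit that never comes. With the guards above, the load, MMA, and softmax roles all skip the empty tile consistently, while the correction role still produces an explicit empty result through `correction_empty`, so `epilogue.store` always receives the tile it is waiting for. A loose host-side analogy of that invariant, with a bounded queue standing in for `pipeline_corr_epi` (names and structure are illustrative only):

```python
import queue
import threading

corr_to_epi = queue.Queue(maxsize=2)  # stand-in for the corr->epi pipeline

def correction_warp(tiles):
    for t in tiles:
        if t["kv_len"] == 0:
            # Analogue of correction_empty: still emit a tile (zeros, lse = -inf),
            # otherwise the epilogue consumer below would block forever.
            corr_to_epi.put({"o": 0.0, "lse": float("-inf")})
        else:
            corr_to_epi.put({"o": "corrected", "lse": "computed"})

def epilogue_warp(num_tiles, out):
    for _ in range(num_tiles):
        out.append(corr_to_epi.get())  # deadlocks if a producer silently skips a tile

tiles = [{"kv_len": 50}, {"kv_len": 0}, {"kv_len": 50}]
results = []
producer = threading.Thread(target=correction_warp, args=(tiles,))
consumer = threading.Thread(target=epilogue_warp, args=(len(tiles), results))
producer.start(); consumer.start()
producer.join(); consumer.join()
assert len(results) == len(tiles)
```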

tests/test_blackwell_fmha.py

Lines changed: 94 additions & 0 deletions

@@ -244,6 +244,89 @@ def test_blackwell_cutlass_varlen(
     torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)


+@pytest.mark.parametrize("qo_indptr_list", [[0, 10, 20, 30, 40, 50, 60, 100]])
+@pytest.mark.parametrize("kv_indptr_list", [[0, 50, 50, 50, 50, 50, 50, 50]])
+@pytest.mark.parametrize("num_qo_heads", [32])
+@pytest.mark.parametrize("num_kv_heads", [8, 32])
+@pytest.mark.parametrize("head_dim_qk", [192, 128])
+@pytest.mark.parametrize("head_dim_vo", [128])
+@pytest.mark.parametrize("sm_scale", [1.0 / math.sqrt(128)])
+@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16])
+def test_blackwell_cutlass_qo_kv_varlen(
+    qo_indptr_list,
+    kv_indptr_list,
+    num_qo_heads,
+    num_kv_heads,
+    head_dim_qk,
+    head_dim_vo,
+    sm_scale,
+    dtype,
+):
+    causal = False
+    if not is_sm100a_supported(torch.device("cuda")):
+        pytest.skip("SM100A is not supported on this device")
+    torch.manual_seed(42)
+    q = torch.randn(
+        qo_indptr_list[-1],
+        num_qo_heads,
+        head_dim_qk,
+        dtype=dtype,
+        device="cuda",
+    )
+    k = torch.randn(
+        kv_indptr_list[-1],
+        num_kv_heads,
+        head_dim_qk,
+        dtype=dtype,
+        device="cuda",
+    )
+    v = torch.randn(
+        kv_indptr_list[-1],
+        num_kv_heads,
+        head_dim_vo,
+        dtype=dtype,
+        device="cuda",
+    )
+
+    qo_indptr = torch.tensor(qo_indptr_list, device="cuda", dtype=torch.int32)
+    kv_indptr = torch.tensor(kv_indptr_list, device="cuda", dtype=torch.int32)
+
+    wrapper = flashinfer.prefill.BatchPrefillWithRaggedKVCacheWrapper(
+        torch.empty(128 * 1024 * 1024, device="cuda", dtype=torch.uint8),
+        kv_layout="NHD",
+        backend="cutlass",
+    )
+
+    wrapper.plan(
+        qo_indptr,
+        kv_indptr,
+        num_qo_heads,
+        num_kv_heads,
+        head_dim_qk,
+        head_dim_vo=head_dim_vo,
+        causal=causal,
+        sm_scale=sm_scale,
+        q_data_type=dtype,
+        kv_data_type=dtype,
+    )
+    o, lse = wrapper.run(q, k, v, return_lse=True)
+
+    gqa_group_ratio = num_qo_heads // num_kv_heads
+    k_repeated = torch.repeat_interleave(k, gqa_group_ratio, dim=1)
+    v_repeated = torch.repeat_interleave(v, gqa_group_ratio, dim=1)
+
+    o_ref, lse_ref = attention_varlen_ref(
+        q, k_repeated, v_repeated, qo_indptr, kv_indptr, causal, sm_scale
+    )
+
+    if dtype == torch.half:
+        torch.testing.assert_close(o[10:60], o_ref[10:60], rtol=1e-3, atol=1e-3)
+    else:
+        torch.testing.assert_close(o[10:60], o_ref[10:60], rtol=1e-2, atol=1e-2)
+
+    torch.testing.assert_close(lse, lse_ref, rtol=1e-3, atol=1e-3)
+
+
 if __name__ == "__main__":
     test_blackwell_cutlass_fmha(
         9,
@@ -268,3 +351,14 @@ def test_blackwell_cutlass_varlen(
         True,
         torch.bfloat16,
     )
+
+    test_blackwell_cutlass_qo_kv_varlen(
+        [0, 10, 20, 30, 40, 50, 60, 100],
+        [0, 50, 50, 50, 50, 50, 50, 50],
+        32,
+        8,
+        128,
+        128,
+        1,
+        torch.bfloat16,
+    )