From 32399ae638de48f10d17699e9c03c8b66be85b36 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 6 Feb 2025 20:47:39 +0000
Subject: [PATCH 1/4] Optimize sgl_moe_align_block_size for deepseek_v3

Signed-off-by: mgoin
---
 csrc/moe/moe_align_sum_kernels.cu | 43 +++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 01dac4044650..7573b5bc39a5 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -198,14 +198,13 @@ __global__ void moe_align_block_size_global_mem_kernel(
 }
 
 // taken from
-// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
+// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
 template <typename scalar_t>
 __global__ void sgl_moe_align_block_size_kernel(
     scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
     int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
     int32_t block_size, size_t numel, int32_t* cumsum) {
   __shared__ int32_t shared_counts[32][8];
-  __shared__ int32_t local_offsets[256];
 
   const int warp_id = threadIdx.x / 32;
   const int lane_id = threadIdx.x % 32;
@@ -251,14 +250,22 @@ __global__ void sgl_moe_align_block_size_kernel(
          i += block_size) {
       expert_ids[i / block_size] = threadIdx.x;
     }
-    local_offsets[threadIdx.x] = cumsum[threadIdx.x];
   }
+}
 
-  __syncthreads();
-
-  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
+// taken from
+// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
+template <typename scalar_t>
+__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
+                                          int32_t* sorted_token_ids,
+                                          int32_t* cumsum_buffer,
+                                          size_t numel) {
+  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t stride = blockDim.x * gridDim.x;
+
+  for (size_t i = tid; i < numel; i += stride) {
     int32_t expert_id = topk_ids[i];
-    int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1);
+    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
     sorted_token_ids[rank_post_pad] = i;
   }
 }
@@ -377,6 +384,9 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                               torch::Tensor experts_ids,
                               torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  TORCH_CHECK(num_experts == 256,
+              "sgl_moe_align_block_size kernel only support deepseek v3 now.");
+
   VLLM_DISPATCH_INTEGRAL_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
         // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
         auto options_int =
             torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
         // torch::Tensor token_cnts_buffer =
-        //     torch::empty({(num_experts + 1) * num_experts}, options_int);
+        //     torch::zeros({(num_experts + 1) * num_experts}, options_int);
         torch::Tensor cumsum_buffer =
-            torch::empty({num_experts + 1}, options_int);
+            torch::zeros({num_experts + 1}, options_int);
 
-        auto kernel = vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
-        kernel<<<1, 1024, 0, stream>>>(
+        auto align_kernel =
+            vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
+        align_kernel<<<1, 1024, 0, stream>>>(
             topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
             experts_ids.data_ptr<int32_t>(), num_tokens_post_pad.data_ptr<int32_t>(),
             num_experts, block_size, topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
+
+        const int block_threads = 256;
+        const int num_blocks =
+            (topk_ids.numel() + block_threads - 1) / block_threads;
+        const int max_blocks = 65535;
+        const int actual_blocks = std::min(num_blocks, max_blocks);
+        auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
+        sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
+            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+            cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
       });
 }

From b31e36de57c6be5b4068a40a523a733e9bcf606f Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 12 Feb 2025 20:29:25 +0000
Subject: [PATCH 2/4] Add sync after initialize

Signed-off-by: mgoin
---
 csrc/moe/moe_align_sum_kernels.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 215a1a3a97a4..7778555004a1 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -207,16 +207,18 @@ __global__ void sgl_moe_align_block_size_kernel(
   __shared__ int32_t shared_counts[32][8];
 
   const int warp_id = threadIdx.x / 32;
-  const int lane_id = threadIdx.x % 32;
   const int experts_per_warp = 8;
   const int my_expert_start = warp_id * experts_per_warp;
 
+  // Initialize shared_counts for this warp's experts
   for (int i = 0; i < experts_per_warp; ++i) {
     if (my_expert_start + i < num_experts) {
       shared_counts[warp_id][i] = 0;
     }
   }
 
+  __syncthreads();
+
   const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
   const size_t start_idx = threadIdx.x * tokens_per_thread;
@@ -229,6 +231,7 @@
 
   __syncthreads();
 
+  // Single thread computes cumulative sum and total tokens
   if (threadIdx.x == 0) {
     cumsum[0] = 0;
     for (int i = 1; i <= num_experts; ++i) {
@@ -245,6 +248,7 @@
 
   __syncthreads();
 
+  // Assign expert IDs to blocks
   if (threadIdx.x < num_experts) {
     for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
          i += block_size) {
@@ -389,12 +393,9 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
 
   VLLM_DISPATCH_INTEGRAL_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
-        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-        // tensors
+        // calc needed amount of shared mem for `cumsum` tensors
         auto options_int =
             torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
-        // torch::Tensor token_cnts_buffer =
-        //     torch::zeros({(num_experts + 1) * num_experts}, options_int);
         torch::Tensor cumsum_buffer =
             torch::zeros({num_experts + 1}, options_int);

From 709c2263a68b0919a02efdace445d11ee172102e Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 13 Feb 2025 17:20:39 +0000
Subject: [PATCH 3/4] Fix case

Signed-off-by: mgoin
---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index f14200e0288e..d0b6249e1c33 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -596,7 +596,7 @@ def moe_align_block_size(
                                      dtype=torch.int32,
                                      device=topk_ids.device)
     if num_experts >= 224:
-        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON:
+        if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON or num_experts != 256:
             moe_align_block_size_triton(
                 topk_ids,
                 num_experts,
@@ -606,6 +606,7 @@ def moe_align_block_size(
                 num_tokens_post_pad,
             )
         else:
+            # Currently requires num_experts=256
             ops.sgl_moe_align_block_size(
                 topk_ids,
                 num_experts,

From 62802f3b3cff93b479d9312ff0fc5629db90d374 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 13 Feb 2025 13:16:13 -0500
Subject: [PATCH 4/4] Update csrc/moe/moe_align_sum_kernels.cu

Co-authored-by: Tyler Michael Smith
---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 7778555004a1..d7be769458e3 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -389,7 +389,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                               torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   TORCH_CHECK(num_experts == 256,
-              "sgl_moe_align_block_size kernel only support deepseek v3 now.");
+              "sgl_moe_align_block_size kernel only supports deepseek v3.");
 
   VLLM_DISPATCH_INTEGRAL_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
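
For reference, below is a minimal standalone CUDA sketch of the grid-stride scatter pattern that the new sgl_moe_token_sort_kernel in PATCH 1 relies on: each thread walks the token list with a stride of blockDim.x * gridDim.x and claims an output slot per token via atomicAdd on a per-expert offset buffer. The kernel body mirrors the patch, but the host driver, the toy sizes, and names such as token_sort_sketch and expert_offsets are illustrative assumptions, not part of vLLM.

// Standalone sketch (not from the patches): grid-stride scatter of token
// indices into per-expert slots, using atomicAdd on per-expert offsets.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void token_sort_sketch(const int* topk_ids, int* sorted_token_ids,
                                  int* expert_offsets, size_t numel) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
    // Claim the next free slot for this expert; ordering within an expert
    // is nondeterministic, which the real kernel also accepts.
    int slot = atomicAdd(&expert_offsets[expert_id], 1);
    sorted_token_ids[slot] = static_cast<int>(i);
  }
}

int main() {
  // Hypothetical toy sizes: 8 token-expert assignments over 4 experts.
  const size_t numel = 8;
  const int num_experts = 4;
  int h_topk[numel] = {0, 2, 1, 2, 3, 0, 1, 2};
  // Offsets would normally come from the cumsum of per-expert counts
  // (produced by the alignment kernel); here they are hard-coded.
  int h_offsets[num_experts] = {0, 2, 4, 7};

  int *d_topk, *d_sorted, *d_offsets;
  cudaMalloc(&d_topk, numel * sizeof(int));
  cudaMalloc(&d_sorted, numel * sizeof(int));
  cudaMalloc(&d_offsets, num_experts * sizeof(int));
  cudaMemcpy(d_topk, h_topk, numel * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_offsets, h_offsets, num_experts * sizeof(int),
             cudaMemcpyHostToDevice);

  const int block_threads = 256;
  const int num_blocks = (numel + block_threads - 1) / block_threads;
  token_sort_sketch<<<num_blocks, block_threads>>>(d_topk, d_sorted,
                                                   d_offsets, numel);

  int h_sorted[numel];
  cudaMemcpy(h_sorted, d_sorted, numel * sizeof(int), cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < numel; ++i) printf("%d ", h_sorted[i]);
  printf("\n");

  cudaFree(d_topk);
  cudaFree(d_sorted);
  cudaFree(d_offsets);
  return 0;
}

The patched host code additionally caps the grid at 65535 blocks and launches on the current CUDA stream via sort_kernel<<<actual_blocks, block_threads, 0, stream>>>; both details are omitted from this sketch for brevity.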