NVIDIA
diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu
Lines changed: 1 addition & 1 deletion
@@ -520,7 +520,7 @@ __global__ void __launch_bounds__(1024) allreduce_fusion_kernel_oneshot_lamport(
 }
 
 template <AllReduceFusionPattern Pattern, typename DType, int NRanks, bool Fp32Acc>
-__global__ void allreduce_fusion_kernel_twoshot_sync(
+__global__ void __launch_bounds__(1024) allreduce_fusion_kernel_twoshot_sync(
     AllReduceFusionParams params, std::array<int, NRanks> begin_tokens, std::array<int, NRanks> token_num_per_ranks)
 {
     IndexHelper<DType> index_helper(params);
Original file line number	Diff line number	Diff line change
`@@ -520,7 +520,7 @@ __global__ void __launch_bounds__(1024) allreduce_fusion_kernel_oneshot_lamport(`
`520`	`520`	`}`
`521`	`521`
`522`	`522`	`template <AllReduceFusionPattern Pattern, typename DType, int NRanks, bool Fp32Acc>`
`523`		`-__global__ void allreduce_fusion_kernel_twoshot_sync(`
	`523`	`+__global__ void __launch_bounds__(1024) allreduce_fusion_kernel_twoshot_sync(`
`524`	`524`	`AllReduceFusionParams params, std::array<int, NRanks> begin_tokens, std::array<int, NRanks> token_num_per_ranks)`
`525`	`525`	`{`
`526`	`526`	`IndexHelper<DType> index_helper(params);`