Skip to content

Commit 04c7465

Browse files
committed
code cleanup
Signed-off-by: Amey Naik <[email protected]>
1 parent c475ea0 commit 04c7465

File tree

4 files changed

+2
-8
lines changed

4 files changed

+2
-8
lines changed

cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -691,11 +691,6 @@ void allreduce_fusion_kernel_launcher(AllReduceFusionParams const& params)
691691
TLLM_CHECK(oneshot || threads_per_block >= params.nranks);
692692
int block_size = threads_per_block;
693693

694-
// // Override block size to 1024 for AllGather operations
695-
// if (params.pattern == AllReduceFusionPattern::kAllGather) {
696-
// block_size = 1024;
697-
// }
698-
699694
TLLM_CHECK(block_size <= 1024 && cluster_size > 0);
700695

701696
int grid_size = (std::min(sm_count, cluster_num * cluster_size) / cluster_size) * cluster_size;

cpp/tensorrt_llm/nanobind/runtime/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,7 @@ void initBindings(nb::module_& m)
345345
.def("get_mc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getMCBuffer);
346346

347347
nb::enum_<tensorrt_llm::kernels::AllReduceFusionOp>(m, "AllReduceFusionOp")
348+
.value("ALLGATHER", tensorrt_llm::kernels::AllReduceFusionOp::ALLGATHER)
348349
.value("NONE", tensorrt_llm::kernels::AllReduceFusionOp::NONE)
349350
.value("RESIDUAL_RMS_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM)
350351
.value("LAST_PROCESS_FOR_UB", tensorrt_llm::kernels::AllReduceFusionOp::LAST_PROCESS_FOR_UB)

tensorrt_llm/_torch/speculative/mtp.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,7 @@ def get_local_max_and_combined(self, logits):
10691069
original_last_dim = combined.shape[-1]
10701070

10711071
# Ensure the combined tensor has at least 4 elements by padding with zeros
1072+
# This is required by the Lamport ALLGATHER kernel implementation
10721073
if combined.numel() < 4:
10731074
padding_size = 4 - combined.numel()
10741075
# Create padding tensor with same shape as combined except for the last dimension

tests/unittest/_torch/multi_gpu/test_allreduce.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,6 @@ def e2m1_and_ufp8sf_scale_to_float_v2(e2m1_tensor,
106106
e2m1_tensor, ufp8_scale_tensor, global_scale_tensor, sf_vec_size,
107107
ufp8_type, is_sf_swizzled_layout)
108108

109-
print(f"DBG AMEY: run_allreduce_op: i am here")
110109
x = x.cuda()
111110
residual = residual.cuda()
112111
norm_weight = torch.randn((hidden_size, ), dtype=dtype, device="cuda")
@@ -383,14 +382,12 @@ def ref_allgather(x, res):
383382
def test_allreduce_fusion_patterns(seq_len, hidden_size, fusion_op,
384383
mpi_pool_executor):
385384
torch.manual_seed(0)
386-
print("DBG AMEY: test_allreduce_fusion_patterns: i am here")
387385
# dtype = torch.bfloat16
388386
dtype = torch.float32
389387
tensor_parallel_size = mpi_pool_executor.num_workers
390388
x = torch.randn((seq_len, hidden_size), dtype=dtype)
391389
residual = torch.randn_like(x)
392390
linear_weight = torch.randn((hidden_size, hidden_size), dtype=dtype)
393-
print(f"DBG AMEY: test_allreduce_fusion_patterns: seq_len={seq_len}, hidden_size={hidden_size}, fusion_op={fusion_op}")
394391
results = mpi_pool_executor.map(
395392
run_single_rank,
396393
*zip(*[(tensor_parallel_size, run_allreduce_op, x, residual,

0 commit comments

Comments
 (0)