
Commit ed9b160

KyleCZH authored and pruthvistony committed
[ROCm] use ncclAllToAll for rocm
Use ncclAllToAll for ROCm version > 5.0; see ROCm/rccl#503 for details on ncclAllToAll. @jithunnair-amd @amathews-amd

Pull Request resolved: pytorch#75128
Approved by: https://github.com/wenkaidu, https://github.com/yzygitzh, https://github.com/seemethere
1 parent 1f19f03 commit ed9b160
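
For reference, a sketch of the RCCL extension's declaration as described in ROCm/rccl#503. The exact header signature is an assumption here, inferred from the call site in the diff below; stock NCCL has no such collective, which is why the call sits behind a preprocessor guard.

// Assumed RCCL declaration (see ROCm/rccl#503); `count` is the number of
// elements exchanged with each peer, not the total buffer length.
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff,
                          size_t count, ncclDataType_t datatype,
                          ncclComm_t comm, hipStream_t stream);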

File tree

1 file changed: +4 -0 lines changed


torch/csrc/cuda/nccl.cpp

Lines changed: 4 additions & 0 deletions
@@ -652,6 +652,9 @@ void all2all_single_equal_split(at::Tensor& input,
   const auto* sendbuff = reinterpret_cast<char*>(input.data_ptr());
   auto* recvbuff = reinterpret_cast<char*>(output.data_ptr());
   auto comm = to_nccl_comm(_comm);
+#if defined(USE_ROCM) && ROCM_VERSION >= 50000
+  NCCL_CHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
+#else
   NCCL_CHECK(ncclCommCount(comm, &numranks));
   NCCL_CHECK(ncclGroupStart());
   for(const auto r : c10::irange(numranks)) {
@@ -663,6 +666,7 @@ void all2all_single_equal_split(at::Tensor& input,
     }
   }
   NCCL_CHECK(ncclGroupEnd());
+#endif
 #else
   AT_ERROR("all2all is only supported for NCCL lib version >= 2.7.0");
 #endif
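
The #else branch above is the portable path that non-ROCm builds keep using: an equal-split all-to-all emulated with one send/recv pair per peer inside a NCCL group. Below is a minimal standalone sketch of that pattern, not PyTorch's code; the helper name, the elemSize parameter, and the offset calculation are illustrative, and the NCCL point-to-point calls (ncclSend, ncclRecv, ncclGroupStart/End) are the standard NCCL >= 2.7 API.

// Minimal sketch (not PyTorch's code) of the grouped send/recv fallback that
// the #else branch implements. Assumes an already-initialized ncclComm_t and
// device buffers each holding numranks * count elements.
#include <nccl.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

#define NCCL_CHECK(cmd) do {                                   \
    ncclResult_t res = (cmd);                                  \
    if (res != ncclSuccess) {                                  \
      std::fprintf(stderr, "NCCL error: %s\n",                 \
                   ncclGetErrorString(res));                   \
      std::abort();                                            \
    }                                                          \
  } while (0)

// Equal-split all-to-all: every rank sends `count` elements to each peer and
// receives `count` elements from each peer. Wrapping the loop in a NCCL group
// lets all 2 * numranks point-to-point operations be issued as one fused op.
void allToAllEqualSplit(const char* sendbuff, char* recvbuff,
                        size_t count, size_t elemSize,
                        ncclDataType_t type, ncclComm_t comm,
                        cudaStream_t stream) {
  int numranks = 0;
  NCCL_CHECK(ncclCommCount(comm, &numranks));
  const size_t rankdiff = count * elemSize;  // byte offset of each rank's slot
  NCCL_CHECK(ncclGroupStart());
  for (int r = 0; r < numranks; ++r) {
    NCCL_CHECK(ncclSend(sendbuff + r * rankdiff, count, type, r, comm, stream));
    NCCL_CHECK(ncclRecv(recvbuff + r * rankdiff, count, type, r, comm, stream));
  }
  NCCL_CHECK(ncclGroupEnd());
}

On ROCm >= 5.0 the new branch replaces this whole loop with a single ncclAllToAll call, handing the exchange to RCCL as one collective rather than numranks explicit send/recv pairs.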
