Adjust the number of threads for DeformConv2D based on compute capability

Licht-T · Licht-T · commit 192b955905de · 2020-10-18T21:53:38.000+09:00
diff --git a/torchvision/csrc/cuda/DeformConv_cuda.cu b/torchvision/csrc/cuda/DeformConv_cuda.cu
@@ -82,12 +82,11 @@ using namespace at;
 
 const int kMaxParallelImgs = 32;
 
-inline unsigned int GET_THREADS(const unsigned int MAX_REGISTERS) {
-  const unsigned int CUDA_MAX_NUM_THREADS = 1024;
-  unsigned int kMaxRegsNumPerBlock =
-      at::cuda::getCurrentDeviceProperties()->regsPerBlock;
-
-  return std::min(CUDA_MAX_NUM_THREADS, kMaxRegsNumPerBlock / MAX_REGISTERS);
+inline unsigned int GET_THREADS() {
+  if (at::cuda::getCurrentDeviceProperties()->major >= 6) {
+    return 1024;
+  }
+  return 512;
 }
 
 inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) {
@@ -231,8 +230,7 @@ static void deformable_im2col(
     at::Tensor data_col) {
   int num_kernels = n_in_channels * out_h * out_w * parallel_imgs;
 
-  const unsigned int max_registers = 60;
-  const unsigned int threads = GET_THREADS(max_registers);
+  const unsigned int threads = GET_THREADS();
   const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -596,8 +594,7 @@ static void compute_grad_input(
   int num_kernels =
       channels * weight_h * weight_w * out_h * out_w * parallel_imgs;
 
-  const unsigned int max_registers = 46;
-  const unsigned int threads = GET_THREADS(max_registers);
+  const unsigned int threads = GET_THREADS();
   const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -805,8 +802,7 @@ static void compute_grad_offset_and_mask(
   int num_kernels =
       out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs;
 
-  const unsigned int max_registers = 63;
-  const unsigned int threads = GET_THREADS(max_registers);
+  const unsigned int threads = GET_THREADS();
   const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(