Skip to content

Commit 0707337

Browse files
committed
Adjust # of threads of DeformConv2D by Compute Capability
1 parent d252925 commit 0707337

File tree

1 file changed

+24
-9
lines changed

1 file changed

+24
-9
lines changed

torchvision/csrc/cuda/DeformConv_cuda.cu

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -80,13 +80,19 @@
8080

8181
using namespace at;
8282

83-
const unsigned int CUDA_NUM_THREADS = 1024;
8483
const int kMaxParallelImgs = 32;
8584

86-
inline unsigned int GET_BLOCKS(const unsigned int N) {
85+
inline unsigned int GET_THREADS() {
86+
if (at::cuda::getCurrentDeviceProperties()->major >= 6) {
87+
return 1024;
88+
}
89+
return 512;
90+
}
91+
92+
inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) {
8793
unsigned int kMaxGridNum =
8894
at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
89-
return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
95+
return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS);
9096
}
9197

9298
template <typename scalar_t>
@@ -224,11 +230,14 @@ static void deformable_im2col(
224230
at::Tensor data_col) {
225231
int num_kernels = n_in_channels * out_h * out_w * parallel_imgs;
226232

233+
const unsigned int threads = GET_THREADS();
234+
const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
235+
227236
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
228237
input.scalar_type(), "deformable_im2col_gpu", ([&] {
229238
deformable_im2col_gpu_kernel<<<
230-
GET_BLOCKS(num_kernels),
231-
CUDA_NUM_THREADS>>>(
239+
blocks,
240+
threads>>>(
232241
num_kernels,
233242
input.data_ptr<scalar_t>(),
234243
data_offset.data_ptr<scalar_t>(),
@@ -585,11 +594,14 @@ static void compute_grad_input(
585594
int num_kernels =
586595
channels * weight_h * weight_w * out_h * out_w * parallel_imgs;
587596

597+
const unsigned int threads = GET_THREADS();
598+
const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
599+
588600
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
589601
columns.scalar_type(), "deformable_col2im_gpu", ([&] {
590602
deformable_col2im_gpu_kernel<<<
591-
GET_BLOCKS(num_kernels),
592-
CUDA_NUM_THREADS>>>(
603+
blocks,
604+
threads>>>(
593605
num_kernels,
594606
columns.data_ptr<scalar_t>(),
595607
offset.data_ptr<scalar_t>(),
@@ -790,11 +802,14 @@ static void compute_grad_offset_and_mask(
790802
int num_kernels =
791803
out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs;
792804

805+
const unsigned int threads = GET_THREADS();
806+
const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
807+
793808
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
794809
columns.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
795810
deformable_col2im_coord_gpu_kernel<<<
796-
GET_BLOCKS(num_kernels),
797-
CUDA_NUM_THREADS>>>(
811+
blocks,
812+
threads>>>(
798813
num_kernels,
799814
columns.data_ptr<scalar_t>(),
800815
input.data_ptr<scalar_t>(),

0 commit comments

Comments
 (0)