@@ -80,13 +80,19 @@
 
 using namespace at;
 
-const unsigned int CUDA_NUM_THREADS = 1024;
 const int kMaxParallelImgs = 32;
 
-inline unsigned int GET_BLOCKS(const unsigned int N) {
+inline unsigned int GET_THREADS() {
+  if (at::cuda::getCurrentDeviceProperties()->major >= 6) {
+    return 1024;
+  }
+  return 512;
+}
+
+inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) {
   unsigned int kMaxGridNum =
       at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
-  return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
+  return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS);
 }
 
 template <typename scalar_t>
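The helpers introduced above split launch configuration into two steps: GET_THREADS() picks the block size from the device's compute capability (1024 threads on SM 6.0 and newer, 512 on older hardware), and GET_BLOCKS() ceiling-divides the element count by that block size, clamped to the device's maxGridSize[0]. The clamp only preserves correctness if the kernels walk their index space with a grid-stride loop, so surviving threads pick up the leftover elements on later passes. Below is a minimal standalone sketch of the same pattern against the plain CUDA runtime API rather than ATen; fill_kernel and the lower-case helper names are illustrative stand-ins, not part of this patch.

// Minimal sketch (assumptions noted above): device-dependent block size plus
// a grid clamped to maxGridSize[0], consumed by a grid-stride kernel.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

// Mirrors GET_THREADS(): 1024 threads per block on compute capability 6.0+,
// 512 on older devices.
inline unsigned int get_threads() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);
  return prop.major >= 6 ? 1024u : 512u;
}

// Mirrors GET_BLOCKS(THREADS, N): ceiling division, clamped to the maximum
// grid dimension so oversized problems still launch.
inline unsigned int get_blocks(unsigned int threads, unsigned int n) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, /*device=*/0);
  const unsigned int max_grid = prop.maxGridSize[0];
  return std::min(max_grid, (n + threads - 1) / threads);
}

// Hypothetical grid-stride kernel: because each thread strides by the total
// thread count, clamping the grid never drops elements.
__global__ void fill_kernel(float* out, float value, unsigned int n) {
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = value;
  }
}

int main() {
  const unsigned int n = 1u << 20;
  float* buf = nullptr;
  cudaMalloc(&buf, n * sizeof(float));

  const unsigned int threads = get_threads();
  const unsigned int blocks = get_blocks(threads, n);
  fill_kernel<<<blocks, threads>>>(buf, 1.0f, n);
  cudaDeviceSynchronize();

  printf("launched %u blocks of %u threads\n", blocks, threads);
  cudaFree(buf);
  return 0;
}

Note that cudaGetDeviceProperties is a comparatively expensive query, so a production version would cache the result; the patch can afford a call per launch because at::cuda::getCurrentDeviceProperties() returns a cached pointer.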
@@ -224,11 +230,14 @@ static void deformable_im2col(
     at::Tensor data_col) {
   int num_kernels = n_in_channels * out_h * out_w * parallel_imgs;
 
+  const unsigned int threads = GET_THREADS();
+  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       input.scalar_type(), "deformable_im2col_gpu", ([&] {
         deformable_im2col_gpu_kernel<<<
-            GET_BLOCKS(num_kernels),
-            CUDA_NUM_THREADS>>>(
+            blocks,
+            threads>>>(
             num_kernels,
             input.data_ptr<scalar_t>(),
             data_offset.data_ptr<scalar_t>(),
@@ -585,11 +594,14 @@ static void compute_grad_input(
   int num_kernels =
       channels * weight_h * weight_w * out_h * out_w * parallel_imgs;
 
+  const unsigned int threads = GET_THREADS();
+  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       columns.scalar_type(), "deformable_col2im_gpu", ([&] {
         deformable_col2im_gpu_kernel<<<
-            GET_BLOCKS(num_kernels),
-            CUDA_NUM_THREADS>>>(
+            blocks,
+            threads>>>(
             num_kernels,
             columns.data_ptr<scalar_t>(),
             offset.data_ptr<scalar_t>(),
@@ -790,11 +802,14 @@ static void compute_grad_offset_and_mask(
   int num_kernels =
       out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs;
 
+  const unsigned int threads = GET_THREADS();
+  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       columns.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
         deformable_col2im_coord_gpu_kernel<<<
-            GET_BLOCKS(num_kernels),
-            CUDA_NUM_THREADS>>>(
+            blocks,
+            threads>>>(
             num_kernels,
             columns.data_ptr<scalar_t>(),
             input.data_ptr<scalar_t>(),