Commit 9e8016d

janeyx99 authored and facebook-github-bot committed
Revert D31932215: [pytorch][PR] Don't #define NUM_THREADS
Test Plan: revert-hammer

Differential Revision: D31932215 (pytorch@f70e806)

Original commit changeset: ccdf11e249fb

fbshipit-source-id: 4c330aebe9cfb483f02ceb1fdaf5c3b0f8fa6fa1
1 parent 10411e3 commit 9e8016d

11 files changed: 92 additions, 90 deletions

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 11 additions & 11 deletions
@@ -59,12 +59,12 @@
 namespace at { namespace native {

 template<int vec_size, typename func_t, typename array_t>
-C10_LAUNCH_BOUNDS_1(num_threads())
+C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
   using traits = function_traits<func_t>;
-  int remaining = N - block_work_size() * blockIdx.x;
+  int remaining = N - block_work_size * blockIdx.x;

-  if (remaining < block_work_size()) { // if this block handles the reminder, just do a naive unrolled loop
+  if (remaining < block_work_size) { // if this block handles the reminder, just do a naive unrolled loop
     auto input_calc = TrivialOffsetCalculator<traits::arity>();
     auto output_calc = TrivialOffsetCalculator<1>();
     auto loader = memory::LoadWithoutCast();
@@ -79,11 +79,11 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
 }

 template<typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t>
-C10_LAUNCH_BOUNDS_1(num_threads())
+C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data,
                                             inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s)
 {
-  int remaining = N - block_work_size() * blockIdx.x;
+  int remaining = N - block_work_size * blockIdx.x;
   auto policy = memory::policies::unroll<array_t, inp_calc_t, out_calc_t, loader_t, storer_t>(data, remaining, ic, oc, l, s);
   elementwise_kernel_helper(f, policy);
 }
@@ -93,25 +93,25 @@ template<typename func_t, typename array_t>
 static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t data) {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
   using traits = function_traits<func_t>;
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
   int vec_size = memory::can_vectorize_up_to<func_t>(data);

   switch (vec_size) {
     case 4:
-      vectorized_elementwise_kernel<4, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data);
+      vectorized_elementwise_kernel<4, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     case 2:
-      vectorized_elementwise_kernel<2, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data);
+      vectorized_elementwise_kernel<2, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     case 1: {
       auto input_calc = TrivialOffsetCalculator<traits::arity>();
       auto output_calc = TrivialOffsetCalculator<1>();
       auto loader = memory::LoadWithoutCast();
       auto storer = memory::StoreWithoutCast();
-      unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, input_calc, output_calc, loader, storer);
+      unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, input_calc, output_calc, loader, storer);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     }
@@ -125,9 +125,9 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da
                                           inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s)
 {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
-  unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc, l, s);
+  unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, ic, oc, l, s);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

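Every launch site in this file follows the same shape: a block runs num_threads threads and owns block_work_size consecutive elements, so the grid is the ceiling division of N by block_work_size. A minimal, self-contained sketch of that pattern (fill_kernel, launch_fill, and the hard-coded constant values are illustrative stand-ins, not part of the PyTorch sources):

// Sketch only: mirrors the grid computation and <<<grid, num_threads>>> launches above.
#include <cuda_runtime.h>
#include <cstdint>

#define NUM_THREADS 64
#define THREAD_WORK_SIZE 4
#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * NUM_THREADS)

constexpr int num_threads = NUM_THREADS;
constexpr int block_work_size = BLOCK_WORK_SIZE;

__global__ void fill_kernel(int N, float* out, float value) {
  // Each block handles a contiguous chunk of block_work_size elements;
  // each thread strides through that chunk with step num_threads.
  int base = block_work_size * blockIdx.x;
  for (int i = threadIdx.x; i < block_work_size && base + i < N; i += num_threads) {
    out[base + i] = value;
  }
}

void launch_fill(int N, float* out, float value, cudaStream_t stream) {
  // Same ceiling division used by launch_vectorized_kernel / launch_unrolled_kernel.
  int64_t grid = (N + block_work_size - 1) / block_work_size;
  fill_kernel<<<grid, num_threads, 0, stream>>>(N, out, value);
}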

aten/src/ATen/native/cuda/CrossKernel.cu

Lines changed: 3 additions & 3 deletions
@@ -36,7 +36,7 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride,
   const auto N = iter.numel();
   auto offset_calculator = make_element_offset_calculator<3>(iter);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + num_threads() - 1) / num_threads();
+  int64_t grid = (N + NUM_THREADS - 1) / NUM_THREADS;
   auto stream = at::cuda::getCurrentCUDAStream();

   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kHalf, iter.common_dtype(), "cross_cuda", [&] {
@@ -45,11 +45,11 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride,
     auto x2 = static_cast<const scalar_t*>(iter.data_ptr(2));
     constexpr int64_t int_max = std::numeric_limits<int>::max();
     if (ostride * 2 > int_max || x1stride * 2 > int_max || x2stride * 2 > int_max) {
-      cross_kernel<<<grid, num_threads(), 0, stream>>>(
+      cross_kernel<<<grid, num_threads, 0, stream>>>(
         N, out, x1, x2, offset_calculator, ostride, x1stride, x2stride);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
-      cross_kernel<<<grid, num_threads(), 0, stream>>>(
+      cross_kernel<<<grid, num_threads, 0, stream>>>(
         N, out, x1, x2, offset_calculator,
         static_cast<int>(ostride),
         static_cast<int>(x1stride),

aten/src/ATen/native/cuda/DistributionTemplates.h

Lines changed: 11 additions & 11 deletions
@@ -188,11 +188,11 @@ __global__ void distribution_binary_elementwise_kernel(
   using input_t_1 = typename function_traits<func_t>::template arg<1>::type;
   using input_t_2 = typename function_traits<func_t>::template arg<2>::type;

-  input_t_1 inputs_1[thread_work_size()];
-  input_t_2 inputs_2[thread_work_size()];
+  input_t_1 inputs_1[thread_work_size];
+  input_t_2 inputs_2[thread_work_size];

-  int base_index = block_work_size() * blockIdx.x;
-  int remaining = std::min<int>(numel - base_index, block_work_size());
+  int base_index = BLOCK_WORK_SIZE * blockIdx.x;
+  int remaining = std::min<int>(numel - base_index, BLOCK_WORK_SIZE);

   curandStatePhilox4_32_10_t state;
   curand_init(std::get<0>(seeds),
@@ -203,7 +203,7 @@ __global__ void distribution_binary_elementwise_kernel(
   // load data into registers
   int thread_idx = threadIdx.x;
   #pragma unroll
-  for (int i = 0; i < thread_work_size(); i++) {
+  for (int i = 0; i < thread_work_size; i++) {
     if (thread_idx >= remaining) {
       break;
     }
@@ -212,20 +212,20 @@ __global__ void distribution_binary_elementwise_kernel(
     inputs_1[i] = input_data_1[offsets[0]];
     inputs_2[i] = input_data_2[offsets[1]];

-    thread_idx += num_threads();
+    thread_idx += num_threads;
   }

   // compute and store
   thread_idx = threadIdx.x;
   #pragma unroll
-  for (int i = 0; i < thread_work_size(); i++) {
+  for (int i = 0; i < thread_work_size; i++) {
     if (thread_idx >= remaining) {
       break;
     }
     int input_idx = thread_idx + base_index;
     auto offsets = out_calc.get(input_idx);
     output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]);
-    thread_idx += num_threads();
+    thread_idx += num_threads;
   }
 }

@@ -254,16 +254,16 @@ void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_arg
   const input_t_1 *input_data_1 = static_cast<const input_t_1 *>(iter.data_ptr(1));
   const input_t_2 *input_data_2 = static_cast<const input_t_2 *>(iter.data_ptr(2));

-  int64_t grid = (numel + block_work_size() - 1) / block_work_size();
+  int64_t grid = (numel + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();

   if (iter.is_contiguous()) {
-    distribution_binary_elementwise_kernel<<<grid,num_threads(), 0, stream>>>(
+    distribution_binary_elementwise_kernel<<<grid,num_threads, 0, stream>>>(
       numel, f, philox_args, output_data, input_data_1, input_data_2,
       TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   } else {
-    distribution_binary_elementwise_kernel<<<grid, num_threads(), 0, stream>>>(
+    distribution_binary_elementwise_kernel<<<grid, num_threads, 0, stream>>>(
      numel, f, philox_args, output_data, input_data_1, input_data_2,
      make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter));
     C10_CUDA_KERNEL_LAUNCH_CHECK();
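The kernel above stages each thread's inputs in registers before computing: each thread walks the block's chunk with stride num_threads, doing at most thread_work_size iterations per phase. A stripped-down sketch of that load/compute split (strided_unary_kernel, in, out, and op are hypothetical stand-ins; the real kernel additionally threads curand state and offset calculators through the loops):

// Sketch only: the block-strided register-staging pattern used by
// distribution_binary_elementwise_kernel, reduced to a unary op.
#define NUM_THREADS 64
#define THREAD_WORK_SIZE 4
#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * NUM_THREADS)

constexpr int num_threads = NUM_THREADS;
constexpr int thread_work_size = THREAD_WORK_SIZE;
constexpr int block_work_size = BLOCK_WORK_SIZE;

template <typename scalar_t, typename func_t>
__global__ void strided_unary_kernel(int numel, scalar_t* out, const scalar_t* in, func_t op) {
  int base_index = block_work_size * blockIdx.x;
  int remaining = min(numel - base_index, block_work_size);

  // Phase 1: stage inputs into registers, striding by num_threads.
  scalar_t inputs[thread_work_size];
  int thread_idx = threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size; i++) {
    if (thread_idx >= remaining) break;
    inputs[i] = in[base_index + thread_idx];
    thread_idx += num_threads;
  }

  // Phase 2: compute and store with the same strided walk.
  thread_idx = threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size; i++) {
    if (thread_idx >= remaining) break;
    out[base_index + thread_idx] = op(inputs[i]);
    thread_idx += num_threads;
  }
}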

aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ void _compute_linear_combination_internal_kernel(
     }
   };

-  _lauch_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
+  _lauch_kernel<num_threads, thread_work_size>(iter.numel(), loop);
 }

 void _compute_linear_combination_cuda_kernel(

aten/src/ATen/native/cuda/LinearAlgebra.cu

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ void _unpack_pivots_internal_kernel(
     }
   };

-  _launch_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
+  _launch_kernel<num_threads, thread_work_size>(iter.numel(), loop);
 }

 void unpack_pivots_cuda_kernel(

aten/src/ATen/native/cuda/Loops.cuh

Lines changed: 14 additions & 10 deletions
@@ -9,9 +9,13 @@

 #include <thrust/tuple.h>

-constexpr int num_threads() { return C10_WARP_SIZE * 4; }
-constexpr int thread_work_size() { return 4; }
-constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+#define NUM_THREADS (C10_WARP_SIZE * 2)
+#define THREAD_WORK_SIZE 4
+#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * num_threads)
+
+constexpr int num_threads = NUM_THREADS;
+constexpr int thread_work_size = THREAD_WORK_SIZE;
+constexpr int block_work_size = BLOCK_WORK_SIZE;

 #include <ATen/native/cuda/MemoryAccess.cuh>

@@ -51,15 +55,15 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {

   int idx = blockIdx.x;

-  return_t results[thread_work_size()];
-  args_t args[thread_work_size()];
+  return_t results[thread_work_size];
+  args_t args[thread_work_size];

   // load
   policy.load(args, idx);

   // compute
   #pragma unroll
-  for (int i = 0; i < thread_work_size(); i++) {
+  for (int i = 0; i < thread_work_size; i++) {
     if (policy.check_inbounds(i)) {
       results[i] = c10::guts::apply(f, args[i]);
     }
@@ -205,18 +209,18 @@ template <typename T> struct is_tuple: std::false_type {};
 template <typename ...T> struct is_tuple<thrust::tuple<T...>>: std::true_type {};

 template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
-C10_LAUNCH_BOUNDS_1(num_threads())
+C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) {
-  int remaining = N - block_work_size() * blockIdx.x;
+  int remaining = N - block_work_size * blockIdx.x;
   elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll<array_t, inp_calc_t, out_calc_t, num_outputs>(data, remaining, ic, oc));
 }

 template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
 static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
-  unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc);
+  unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, ic, oc);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

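The first hunk in this file is the heart of the revert: it swaps the constexpr-function spelling (num_threads(), thread_work_size(), block_work_size()) back to macros plus macro-backed constexpr variables, which is why every other call site in this commit drops the trailing parentheses. A side-by-side sketch of the two styles (the removed functions carry a _fn suffix here so both forms can coexist in one snippet, and the concrete values are illustrative rather than the C10_WARP_SIZE expressions in the real header):

// Style removed by this revert: constexpr functions, spelled num_threads() at call sites.
constexpr int num_threads_fn() { return 128; }
constexpr int thread_work_size_fn() { return 4; }
constexpr int block_work_size_fn() { return thread_work_size_fn() * num_threads_fn(); }

// Style restored by this revert: macros backing plain constexpr variables,
// spelled num_threads (no parentheses) at call sites.
#define NUM_THREADS 64
#define THREAD_WORK_SIZE 4
#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * num_threads)

constexpr int num_threads = NUM_THREADS;
constexpr int thread_work_size = THREAD_WORK_SIZE;
constexpr int block_work_size = BLOCK_WORK_SIZE;

// Both spellings are integral constant expressions, so either can feed
// C10_LAUNCH_BOUNDS_1, fixed-size arrays, and grid math at compile time.
static_assert(block_work_size == THREAD_WORK_SIZE * NUM_THREADS, "macro-backed constant");
static_assert(block_work_size_fn() == 4 * 128, "constexpr function is also a compile-time constant");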

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 21 additions & 21 deletions
@@ -59,7 +59,7 @@ struct vectorized_load_helper {
     using arg_t = std::tuple_element_t<arg_index, args_t>;
     // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
     // need a +1 offset to get the input
-    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;
+    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size * idx;
     auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get<arg_index>(args[thread_unroll_idx]); };
     self.load_single_arg(args_accessor, ptr);
   }
@@ -164,38 +164,38 @@ struct unroll {
     data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}

   __device__ inline bool check_inbounds(int thread_work_elem) {
-    return ((threadIdx.x + thread_work_elem*num_threads()) < remaining);
+    return ((threadIdx.x + thread_work_elem*num_threads) < remaining);
   }

   template<typename args_t>
   __device__ inline void load(args_t *args, int idx) {
     constexpr int arity = std::tuple_size<args_t>::value;
     int thread_idx = threadIdx.x;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       auto offset = input_offset_calculator.get(linear_idx);
       detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }

   template<typename scalar_t>
   __device__ inline void store(scalar_t *from, int idx) {
     int thread_idx = threadIdx.x;
-    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;
+    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size * idx;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       int offset = output_offset_calculator.get(linear_idx)[0];
       storer.store(from[i], data[0], offset);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }
 };
@@ -208,8 +208,8 @@ struct unroll {
 template <int vec_size, typename data_t> // vec_size: number of scalars, can be 1, 2, or 4.
 struct vectorized {

-  static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size");
-  static constexpr int loop_size = thread_work_size() / vec_size;
+  static_assert(thread_work_size % vec_size == 0, "The workload per thread must be a multiple of vec_size");
+  static constexpr int loop_size = thread_work_size / vec_size;

   data_t data;

@@ -226,7 +226,7 @@ struct vectorized {
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < loop_size; i++) {
-      int index = thread_idx + i * num_threads();
+      int index = thread_idx + i * num_threads;
       vec_t v = from_[index];
       #pragma unroll
       for (int j = 0; j < vec_size; j++) {
@@ -244,12 +244,12 @@ struct vectorized {
   template<typename scalar_t>
   __device__ inline void store(scalar_t *from, int idx) {
     using vec_t = aligned_vector<scalar_t, vec_size>;
-    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;
+    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size * idx;
     vec_t *to_ = reinterpret_cast<vec_t *>(to);
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < loop_size; i++) {
-      int index = thread_idx + i * num_threads();
+      int index = thread_idx + i * num_threads;
       vec_t v;
       for (int j = 0; j < vec_size; j++) {
         v.val[j] = from[vec_size * i + j];
@@ -274,22 +274,22 @@ struct multi_outputs_unroll {
     data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}

   __device__ inline bool check_inbounds(int thread_work_elem) {
-    return ((threadIdx.x + thread_work_elem*num_threads()) < remaining);
+    return ((threadIdx.x + thread_work_elem*num_threads) < remaining);
   }

   template<typename args_t>
   __device__ inline void load(args_t *args, int idx) {
     constexpr int arity = std::tuple_size<args_t>::value;
     int thread_idx = threadIdx.x;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       auto offset = input_offset_calculator.get(linear_idx);
       detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }

@@ -298,14 +298,14 @@ struct multi_outputs_unroll {
   __device__ inline void store(return_t *from, int idx) {
     int thread_idx = threadIdx.x;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= this->remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       auto offsets = this->output_offset_calculator.get(linear_idx);
       memory::detail::static_unroll<detail::multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }
 };
