Commit 9e8016d

janeyx99 authored and facebook-github-bot committed
Revert D31932215: [pytorch][PR] Don't #define NUM_THREADS
Test Plan: revert-hammer

Differential Revision: D31932215 (pytorch@f70e806)

Original commit changeset: ccdf11e249fb

fbshipit-source-id: 4c330aebe9cfb483f02ceb1fdaf5c3b0f8fa6fa1
1 parent 10411e3 commit 9e8016d

11 files changed: 92 additions, 90 deletions

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 11 additions & 11 deletions
@@ -59,12 +59,12 @@
 namespace at { namespace native {

 template<int vec_size, typename func_t, typename array_t>
-C10_LAUNCH_BOUNDS_1(num_threads())
+C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
   using traits = function_traits<func_t>;
-  int remaining = N - block_work_size() * blockIdx.x;
+  int remaining = N - block_work_size * blockIdx.x;

-  if (remaining < block_work_size()) { // if this block handles the reminder, just do a naive unrolled loop
+  if (remaining < block_work_size) { // if this block handles the reminder, just do a naive unrolled loop
     auto input_calc = TrivialOffsetCalculator<traits::arity>();
     auto output_calc = TrivialOffsetCalculator<1>();
     auto loader = memory::LoadWithoutCast();
@@ -79,11 +79,11 @@ __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
 }

 template<typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t>
-C10_LAUNCH_BOUNDS_1(num_threads())
+C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void unrolled_elementwise_kernel(int N, func_t f, array_t data,
                                             inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s)
 {
-  int remaining = N - block_work_size() * blockIdx.x;
+  int remaining = N - block_work_size * blockIdx.x;
   auto policy = memory::policies::unroll<array_t, inp_calc_t, out_calc_t, loader_t, storer_t>(data, remaining, ic, oc, l, s);
   elementwise_kernel_helper(f, policy);
 }
@@ -93,25 +93,25 @@ template<typename func_t, typename array_t>
 static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t data) {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
   using traits = function_traits<func_t>;
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
   int vec_size = memory::can_vectorize_up_to<func_t>(data);

   switch (vec_size) {
     case 4:
-      vectorized_elementwise_kernel<4, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data);
+      vectorized_elementwise_kernel<4, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     case 2:
-      vectorized_elementwise_kernel<2, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data);
+      vectorized_elementwise_kernel<2, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     case 1: {
       auto input_calc = TrivialOffsetCalculator<traits::arity>();
       auto output_calc = TrivialOffsetCalculator<1>();
       auto loader = memory::LoadWithoutCast();
       auto storer = memory::StoreWithoutCast();
-      unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, input_calc, output_calc, loader, storer);
+      unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, input_calc, output_calc, loader, storer);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     }
@@ -125,9 +125,9 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da
                                           inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s)
 {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
-  unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc, l, s);
+  unrolled_elementwise_kernel<func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, ic, oc, l, s);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

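Every launch site in this file follows the same shape: a block runs num_threads threads and owns block_work_size consecutive elements, so the grid is the ceiling division of N by block_work_size. A minimal, self-contained sketch of that pattern (fill_kernel, launch_fill, and the hard-coded constant values are illustrative stand-ins, not part of the PyTorch sources):

// Sketch only: mirrors the grid computation and <<<grid, num_threads>>> launches above.
#include <cuda_runtime.h>
#include <cstdint>

#define NUM_THREADS 64
#define THREAD_WORK_SIZE 4
#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * NUM_THREADS)

constexpr int num_threads = NUM_THREADS;
constexpr int block_work_size = BLOCK_WORK_SIZE;

__global__ void fill_kernel(int N, float* out, float value) {
  // Each block handles a contiguous chunk of block_work_size elements;
  // each thread strides through that chunk with step num_threads.
  int base = block_work_size * blockIdx.x;
  for (int i = threadIdx.x; i < block_work_size && base + i < N; i += num_threads) {
    out[base + i] = value;
  }
}

void launch_fill(int N, float* out, float value, cudaStream_t stream) {
  // Same ceiling division used by launch_vectorized_kernel / launch_unrolled_kernel.
  int64_t grid = (N + block_work_size - 1) / block_work_size;
  fill_kernel<<<grid, num_threads, 0, stream>>>(N, out, value);
}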

aten/src/ATen/native/cuda/CrossKernel.cu

Lines changed: 3 additions & 3 deletions
@@ -36,7 +36,7 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride,
   const auto N = iter.numel();
   auto offset_calculator = make_element_offset_calculator<3>(iter);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + num_threads() - 1) / num_threads();
+  int64_t grid = (N + NUM_THREADS - 1) / NUM_THREADS;
   auto stream = at::cuda::getCurrentCUDAStream();

   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kHalf, iter.common_dtype(), "cross_cuda", [&] {
@@ -45,11 +45,11 @@ void launch_cross_kernel(const TensorIteratorBase& iter, int64_t ostride,
     auto x2 = static_cast<const scalar_t*>(iter.data_ptr(2));
     constexpr int64_t int_max = std::numeric_limits<int>::max();
     if (ostride * 2 > int_max || x1stride * 2 > int_max || x2stride * 2 > int_max) {
-      cross_kernel<<<grid, num_threads(), 0, stream>>>(
+      cross_kernel<<<grid, num_threads, 0, stream>>>(
         N, out, x1, x2, offset_calculator, ostride, x1stride, x2stride);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
-      cross_kernel<<<grid, num_threads(), 0, stream>>>(
+      cross_kernel<<<grid, num_threads, 0, stream>>>(
         N, out, x1, x2, offset_calculator,
         static_cast<int>(ostride),
         static_cast<int>(x1stride),

aten/src/ATen/native/cuda/DistributionTemplates.h

Lines changed: 11 additions & 11 deletions
@@ -188,11 +188,11 @@ __global__ void distribution_binary_elementwise_kernel(
   using input_t_1 = typename function_traits<func_t>::template arg<1>::type;
   using input_t_2 = typename function_traits<func_t>::template arg<2>::type;

-  input_t_1 inputs_1[thread_work_size()];
-  input_t_2 inputs_2[thread_work_size()];
+  input_t_1 inputs_1[thread_work_size];
+  input_t_2 inputs_2[thread_work_size];

-  int base_index = block_work_size() * blockIdx.x;
-  int remaining = std::min<int>(numel - base_index, block_work_size());
+  int base_index = BLOCK_WORK_SIZE * blockIdx.x;
+  int remaining = std::min<int>(numel - base_index, BLOCK_WORK_SIZE);

   curandStatePhilox4_32_10_t state;
   curand_init(std::get<0>(seeds),
@@ -203,7 +203,7 @@ __global__ void distribution_binary_elementwise_kernel(
   // load data into registers
   int thread_idx = threadIdx.x;
   #pragma unroll
-  for (int i = 0; i < thread_work_size(); i++) {
+  for (int i = 0; i < thread_work_size; i++) {
     if (thread_idx >= remaining) {
       break;
     }
@@ -212,20 +212,20 @@ __global__ void distribution_binary_elementwise_kernel(
     inputs_1[i] = input_data_1[offsets[0]];
     inputs_2[i] = input_data_2[offsets[1]];

-    thread_idx += num_threads();
+    thread_idx += num_threads;
   }

   // compute and store
   thread_idx = threadIdx.x;
   #pragma unroll
-  for (int i = 0; i < thread_work_size(); i++) {
+  for (int i = 0; i < thread_work_size; i++) {
     if (thread_idx >= remaining) {
       break;
     }
     int input_idx = thread_idx + base_index;
     auto offsets = out_calc.get(input_idx);
     output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]);
-    thread_idx += num_threads();
+    thread_idx += num_threads;
   }
 }

@@ -254,16 +254,16 @@ void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_arg
   const input_t_1 *input_data_1 = static_cast<const input_t_1 *>(iter.data_ptr(1));
   const input_t_2 *input_data_2 = static_cast<const input_t_2 *>(iter.data_ptr(2));

-  int64_t grid = (numel + block_work_size() - 1) / block_work_size();
+  int64_t grid = (numel + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();

   if (iter.is_contiguous()) {
-    distribution_binary_elementwise_kernel<<<grid,num_threads(), 0, stream>>>(
+    distribution_binary_elementwise_kernel<<<grid,num_threads, 0, stream>>>(
       numel, f, philox_args, output_data, input_data_1, input_data_2,
       TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   } else {
-    distribution_binary_elementwise_kernel<<<grid, num_threads(), 0, stream>>>(
+    distribution_binary_elementwise_kernel<<<grid, num_threads, 0, stream>>>(
      numel, f, philox_args, output_data, input_data_1, input_data_2,
      make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter));
     C10_CUDA_KERNEL_LAUNCH_CHECK();
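The kernel above stages each thread's inputs in registers before computing: each thread walks the block's chunk with stride num_threads, doing at most thread_work_size iterations per phase. A stripped-down sketch of that load/compute split (strided_unary_kernel, in, out, and op are hypothetical stand-ins; the real kernel additionally threads curand state and offset calculators through the loops):

// Sketch only: the block-strided register-staging pattern used by
// distribution_binary_elementwise_kernel, reduced to a unary op.
#define NUM_THREADS 64
#define THREAD_WORK_SIZE 4
#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * NUM_THREADS)

constexpr int num_threads = NUM_THREADS;
constexpr int thread_work_size = THREAD_WORK_SIZE;
constexpr int block_work_size = BLOCK_WORK_SIZE;

template <typename scalar_t, typename func_t>
__global__ void strided_unary_kernel(int numel, scalar_t* out, const scalar_t* in, func_t op) {
  int base_index = block_work_size * blockIdx.x;
  int remaining = min(numel - base_index, block_work_size);

  // Phase 1: stage inputs into registers, striding by num_threads.
  scalar_t inputs[thread_work_size];
  int thread_idx = threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size; i++) {
    if (thread_idx >= remaining) break;
    inputs[i] = in[base_index + thread_idx];
    thread_idx += num_threads;
  }

  // Phase 2: compute and store with the same strided walk.
  thread_idx = threadIdx.x;
  #pragma unroll
  for (int i = 0; i < thread_work_size; i++) {
    if (thread_idx >= remaining) break;
    out[base_index + thread_idx] = op(inputs[i]);
    thread_idx += num_threads;
  }
}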

aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ void _compute_linear_combination_internal_kernel(
     }
   };

-  _lauch_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
+  _lauch_kernel<num_threads, thread_work_size>(iter.numel(), loop);
 }

 void _compute_linear_combination_cuda_kernel(

aten/src/ATen/native/cuda/LinearAlgebra.cu

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ void _unpack_pivots_internal_kernel(
     }
   };

-  _launch_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
+  _launch_kernel<num_threads, thread_work_size>(iter.numel(), loop);
 }

 void unpack_pivots_cuda_kernel(

aten/src/ATen/native/cuda/Loops.cuh

Lines changed: 14 additions & 10 deletions
@@ -9,9 +9,13 @@

 #include <thrust/tuple.h>

-constexpr int num_threads() { return C10_WARP_SIZE * 4; }
-constexpr int thread_work_size() { return 4; }
-constexpr int block_work_size() { return thread_work_size() * num_threads(); }
+#define NUM_THREADS (C10_WARP_SIZE * 2)
+#define THREAD_WORK_SIZE 4
+#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * num_threads)
+
+constexpr int num_threads = NUM_THREADS;
+constexpr int thread_work_size = THREAD_WORK_SIZE;
+constexpr int block_work_size = BLOCK_WORK_SIZE;

 #include <ATen/native/cuda/MemoryAccess.cuh>

@@ -51,15 +55,15 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {

   int idx = blockIdx.x;

-  return_t results[thread_work_size()];
-  args_t args[thread_work_size()];
+  return_t results[thread_work_size];
+  args_t args[thread_work_size];

   // load
   policy.load(args, idx);

   // compute
   #pragma unroll
-  for (int i = 0; i < thread_work_size(); i++) {
+  for (int i = 0; i < thread_work_size; i++) {
     if (policy.check_inbounds(i)) {
       results[i] = c10::guts::apply(f, args[i]);
     }
@@ -205,18 +209,18 @@ template <typename T> struct is_tuple: std::false_type {};
 template <typename ...T> struct is_tuple<thrust::tuple<T...>>: std::true_type {};

 template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
-C10_LAUNCH_BOUNDS_1(num_threads())
+C10_LAUNCH_BOUNDS_1(num_threads)
 __global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) {
-  int remaining = N - block_work_size() * blockIdx.x;
+  int remaining = N - block_work_size * blockIdx.x;
   elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll<array_t, inp_calc_t, out_calc_t, num_outputs>(data, remaining, ic, oc));
 }

 template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
 static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) {
   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
-  unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc);
+  unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads, 0, stream>>>(N, f, data, ic, oc);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

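The first hunk in this file is the heart of the revert: it swaps the constexpr-function spelling (num_threads(), thread_work_size(), block_work_size()) back to macros plus macro-backed constexpr variables, which is why every other call site in this commit drops the trailing parentheses. A side-by-side sketch of the two styles (the removed functions carry a _fn suffix here so both forms can coexist in one snippet, and the concrete values are illustrative rather than the C10_WARP_SIZE expressions in the real header):

// Style removed by this revert: constexpr functions, spelled num_threads() at call sites.
constexpr int num_threads_fn() { return 128; }
constexpr int thread_work_size_fn() { return 4; }
constexpr int block_work_size_fn() { return thread_work_size_fn() * num_threads_fn(); }

// Style restored by this revert: macros backing plain constexpr variables,
// spelled num_threads (no parentheses) at call sites.
#define NUM_THREADS 64
#define THREAD_WORK_SIZE 4
#define BLOCK_WORK_SIZE (THREAD_WORK_SIZE * num_threads)

constexpr int num_threads = NUM_THREADS;
constexpr int thread_work_size = THREAD_WORK_SIZE;
constexpr int block_work_size = BLOCK_WORK_SIZE;

// Both spellings are integral constant expressions, so either can feed
// C10_LAUNCH_BOUNDS_1, fixed-size arrays, and grid math at compile time.
static_assert(block_work_size == THREAD_WORK_SIZE * NUM_THREADS, "macro-backed constant");
static_assert(block_work_size_fn() == 4 * 128, "constexpr function is also a compile-time constant");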

aten/src/ATen/native/cuda/MemoryAccess.cuh

Lines changed: 21 additions & 21 deletions
@@ -59,7 +59,7 @@ struct vectorized_load_helper {
     using arg_t = std::tuple_element_t<arg_index, args_t>;
     // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
     // need a +1 offset to get the input
-    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;
+    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size * idx;
     auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get<arg_index>(args[thread_unroll_idx]); };
     self.load_single_arg(args_accessor, ptr);
   }
@@ -164,38 +164,38 @@ struct unroll {
     data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}

   __device__ inline bool check_inbounds(int thread_work_elem) {
-    return ((threadIdx.x + thread_work_elem*num_threads()) < remaining);
+    return ((threadIdx.x + thread_work_elem*num_threads) < remaining);
   }

   template<typename args_t>
   __device__ inline void load(args_t *args, int idx) {
     constexpr int arity = std::tuple_size<args_t>::value;
     int thread_idx = threadIdx.x;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       auto offset = input_offset_calculator.get(linear_idx);
       detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }

   template<typename scalar_t>
   __device__ inline void store(scalar_t *from, int idx) {
     int thread_idx = threadIdx.x;
-    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;
+    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size * idx;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       int offset = output_offset_calculator.get(linear_idx)[0];
       storer.store(from[i], data[0], offset);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }
 };
@@ -208,8 +208,8 @@ struct unroll {
 template <int vec_size, typename data_t> // vec_size: number of scalars, can be 1, 2, or 4.
 struct vectorized {

-  static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size");
-  static constexpr int loop_size = thread_work_size() / vec_size;
+  static_assert(thread_work_size % vec_size == 0, "The workload per thread must be a multiple of vec_size");
+  static constexpr int loop_size = thread_work_size / vec_size;

   data_t data;

@@ -226,7 +226,7 @@ struct vectorized {
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < loop_size; i++) {
-      int index = thread_idx + i * num_threads();
+      int index = thread_idx + i * num_threads;
       vec_t v = from_[index];
       #pragma unroll
       for (int j = 0; j < vec_size; j++) {
@@ -244,12 +244,12 @@ struct vectorized {
   template<typename scalar_t>
   __device__ inline void store(scalar_t *from, int idx) {
     using vec_t = aligned_vector<scalar_t, vec_size>;
-    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;
+    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size * idx;
     vec_t *to_ = reinterpret_cast<vec_t *>(to);
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < loop_size; i++) {
-      int index = thread_idx + i * num_threads();
+      int index = thread_idx + i * num_threads;
       vec_t v;
       for (int j = 0; j < vec_size; j++) {
         v.val[j] = from[vec_size * i + j];
@@ -274,22 +274,22 @@ struct multi_outputs_unroll {
     data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}

   __device__ inline bool check_inbounds(int thread_work_elem) {
-    return ((threadIdx.x + thread_work_elem*num_threads()) < remaining);
+    return ((threadIdx.x + thread_work_elem*num_threads) < remaining);
   }

   template<typename args_t>
   __device__ inline void load(args_t *args, int idx) {
     constexpr int arity = std::tuple_size<args_t>::value;
     int thread_idx = threadIdx.x;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       auto offset = input_offset_calculator.get(linear_idx);
       detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }

@@ -298,14 +298,14 @@ struct multi_outputs_unroll {
   __device__ inline void store(return_t *from, int idx) {
     int thread_idx = threadIdx.x;
     #pragma unroll
-    for (int i = 0; i < thread_work_size(); i++) {
+    for (int i = 0; i < thread_work_size; i++) {
       if (thread_idx >= this->remaining) {
         return;
       }
-      int linear_idx = thread_idx + block_work_size() * idx;
+      int linear_idx = thread_idx + block_work_size * idx;
       auto offsets = this->output_offset_calculator.get(linear_idx);
       memory::detail::static_unroll<detail::multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);
-      thread_idx += num_threads();
+      thread_idx += num_threads;
     }
   }
 };
