
Commit 3f49dbf

fix bugs in syncbn (pytorch#46)
- incorrect use of __shfl_down
- fix warp size assumptions
- update unit tests to exit on failure
1 parent c1e88fa commit 3f49dbf
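The first bullet is the core portability bug: call sites in csrc/welford.cu invoke the shuffle as SHFL_DOWN(0xffffffff, val, i), passing a CUDA-style participation mask. HIP's __shfl_down takes no mask argument, so the old alias "#define SHFL_DOWN __shfl_down" made HIP shuffle the mask constant instead of the value. The new macro keeps the call sites unchanged and simply drops the mask on HIP. A minimal standalone sketch of the pattern (the warp_sum helper is illustrative, not part of the commit):

#if defined __HIP_PLATFORM_HCC__
// HIP: __shfl_down(var, delta) takes no mask, and a GCN wavefront has 64 lanes.
#define WARP_SIZE 64
#define SHFL_DOWN(mask, val, i) __shfl_down(val, i)
#else
// CUDA 9+: __shfl_down_sync(mask, var, delta).
#define WARP_SIZE 32
#define SHFL_DOWN __shfl_down_sync
#endif

// Illustrative warp-level tree reduction written against the portable macro;
// the full-warp mask is used on CUDA and ignored on HIP.
template <typename T>
__device__ __forceinline__ T warp_sum(T val) {
  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
    val += SHFL_DOWN(0xffffffff, val, offset);
  return val;
}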

File tree

6 files changed: +68 additions, -27 deletions


csrc/welford.cu

Lines changed: 51 additions & 20 deletions
@@ -12,7 +12,7 @@
 #include "compat.h"

 #if defined __HIP_PLATFORM_HCC__
-#define SHFL_DOWN __shfl_down
+#define SHFL_DOWN(mask,val,i) __shfl_down(val, i)
 #else
 #define SHFL_DOWN __shfl_down_sync
 #endif
@@ -44,8 +44,11 @@ __host__ __forceinline__ int h_last_pow2(unsigned int n) {
   return n - (n >> 1);
 }

-
+#ifdef __HIP_PLATFORM_HCC__
+#define WARP_SIZE 64
+#else
 #define WARP_SIZE 32
+#endif

 template<typename T>
 __device__ __forceinline__ T warp_reduce_sum(T val)
@@ -61,25 +64,27 @@ __device__ __forceinline__ T reduce_block(T *x, T val)
 {
   int tid = threadIdx.y*blockDim.x + threadIdx.x;
   int blockSize = blockDim.x * blockDim.y;
+  int lane = tid % WARP_SIZE;
+  int wid = tid / WARP_SIZE;

-  if (blockSize > 32) {
+  if (blockSize > WARP_SIZE) {
     val = warp_reduce_sum(val);
-    if (tid % WARP_SIZE == 0)
-      x[tid/WARP_SIZE] = val;
+    if (lane == 0)
+      x[wid] = val;

     __syncthreads();

-    val = (tid < blockSize / WARP_SIZE? x[tid%WARP_SIZE] : T(0));
+    val = (tid < blockSize / WARP_SIZE? x[lane] : T(0));
   }

-  if(tid/WARP_SIZE==0) val = warp_reduce_sum(val);
+  if(wid==0) val = warp_reduce_sum(val);

   return val;
 }

 #define ELEMENTS_PER_ITER 4 // enables concurrency within each thread to hide latency
 #define ELEMENTS_PER_THREAD 16
-#define OPTIMAL_TILE_W 32
+#define OPTIMAL_TILE_W WARP_SIZE
 #define MAX_H_BLOCK 128
 #define MAX_BLOCK_SIZE 512

@@ -137,11 +142,7 @@ __device__ __forceinline__ void warp_reduce_mean_m2n(T &mean, T &m2n, int &num)
     auto num_new = SHFL_DOWN(0xffffffff, num, i);
     auto mean_new = SHFL_DOWN(0xffffffff, mean, i);
     auto m2n_new = SHFL_DOWN(0xffffffff, m2n, i);
-#if defined __HIP_PLATFORM_HCC__
-    welford_merge_element<T, int>(num, mean, m2n, num_new, mean_new, m2n_new);
-#else
     welford_merge_element(num, mean, m2n, num_new, mean_new, m2n_new);
-#endif
   }
 }

@@ -158,7 +159,7 @@ __device__ void welford_reduce_mean_m2n(
   int lane = thread_id % WARP_SIZE;
   int wid = thread_id / WARP_SIZE;

-  if (block_size > 32) {
+  if (block_size > WARP_SIZE) {
     warp_reduce_mean_m2n(mean, m2n, num);
     if (lane == 0) {
       x[wid*2] = mean;
@@ -265,6 +266,9 @@ __device__ __forceinline__ void merge_block_vertical(T& sum_dy,

 // welford kernel calculating mean/biased_variance/unbiased_variance
 template <typename scalar_t, typename accscalar_t, typename outscalar_t>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void welford_kernel(
       const scalar_t* __restrict__ input,
       outscalar_t* __restrict__ out_mean,
@@ -291,8 +295,8 @@ __global__ void welford_kernel(
     }
   }

-  static __shared__ int s_mem[160];
-  accscalar_t* s_mem_ac = (accscalar_t*) &s_mem[32];
+  static __shared__ int s_mem[WARP_SIZE];
+  static __shared__ accscalar_t s_mem_ac[WARP_SIZE*2];

   welford_reduce_mean_m2n<accscalar_t>(s_mem_ac, s_mem, x_mean, m_2_n, count, block_size, thread_id);

@@ -304,6 +308,9 @@ __global__ void welford_kernel(

 // elementwise BN kernel
 template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void batchnorm_forward_kernel(
       const scalar_t* __restrict__ input,
       const accscalar_t* __restrict__ mean,
@@ -331,6 +338,9 @@ __global__ void batchnorm_forward_kernel(
 // Breaking the grad_input to two step to support sync BN, which requires all
 // reduce of the intermediate results across processes.
 template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void reduce_bn_kernel(
       const scalar_t* __restrict__ input,
       const scalar_t* __restrict__ grad_output,
@@ -343,7 +353,7 @@ __global__ void reduce_bn_kernel(
       const int bs,
       const int fs,
       const int ss) {
-  static __shared__ int s_mem[64];
+  static __shared__ int s_mem[WARP_SIZE];
   //int total_item_num = bs * ss;

   int thread_id = threadIdx.y*blockDim.x + threadIdx.x;
@@ -395,6 +405,9 @@ __global__ void reduce_bn_kernel(

 // elementwise backward BN kernel
 template <typename scalar_t, typename accscalar_t, typename layerscalar_t>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void batchnorm_backward_kernel(
       const scalar_t* __restrict__ grad_output,
       const scalar_t* __restrict__ input,
@@ -434,6 +447,9 @@ template
     typename accscalar_t,
     typename outscalar_t,
     int PARALLEL_LOADS>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void
 welford_kernel_c_last(
       const scalar_t* __restrict__ input,
@@ -575,6 +591,9 @@ welford_kernel_c_last(
 // parallel welford kernel to further reduce mean / biased_var
 // into mean / unbiased_var / inv_std across multiple processes.
 template <typename scalar_t>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void welford_kernel_parallel(
       const scalar_t* __restrict__ mean,
       const scalar_t* __restrict__ var_biased,
@@ -608,6 +627,9 @@ template <
     typename accscalar_t,
     typename layerscalar_t,
     int PARALLEL_LOADS>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void batchnorm_forward_c_last_kernel(
       const scalar_t* __restrict__ input,
       const scalar_t* __restrict__ z,
@@ -658,6 +680,9 @@ template <
     typename accscalar_t,
     typename layerscalar_t,
     int PARALLEL_LOADS>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void relu_backward_c_last_kernel(
       const scalar_t* __restrict__ grad_output,
       const scalar_t* __restrict__ input,
@@ -708,6 +733,9 @@ template
     typename accscalar_t,
     typename layerscalar_t,
     int PARALLEL_LOADS>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void reduce_bn_c_last_kernel(
       const scalar_t* __restrict__ input,
       const scalar_t* __restrict__ grad_output,
@@ -861,6 +889,9 @@ template <
     typename accscalar_t,
     typename layerscalar_t,
     int PARALLEL_LOADS>
+#ifdef __HIP_PLATFORM_HCC__
+__launch_bounds__(MAX_BLOCK_SIZE)
+#endif
 __global__ void batchnorm_backward_c_last_kernel(
       const scalar_t* __restrict__ grad_output,
       const scalar_t* __restrict__ input,
@@ -921,7 +952,7 @@ std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
   at::Tensor out_var_biased = at::empty({feature_size}, input.options().dtype(scalar_type));
   at::Tensor out_mean = at::empty({feature_size}, input.options().dtype(scalar_type));

-  int block_y = min(h_last_pow2(batch_size), int(MAX_BLOCK_SIZE / 32));
+  int block_y = min(h_last_pow2(batch_size), int(MAX_BLOCK_SIZE / WARP_SIZE));
   int block_x = max(1, min(MAX_BLOCK_SIZE / block_y, h_last_pow2(space_size)));
   const dim3 block(block_x, block_y);
   const dim3 grid(feature_size);
@@ -957,7 +988,7 @@ at::Tensor batchnorm_forward_CUDA(

   auto space_size = get_tensor_spatial_size(input);

-  int block_x = max(32, min(MAX_BLOCK_SIZE, h_last_pow2(space_size)/4));
+  int block_x = max(WARP_SIZE, min(MAX_BLOCK_SIZE, h_last_pow2(space_size)/4));
   int block_y = max(1, min(MAX_BLOCK_SIZE/block_x, h_last_pow2(batch_size)/4));
   const dim3 block(block_x, block_y);
   int grid_z = max(1, min(65535, h_last_pow2(space_size)/4/block_x));
@@ -1030,7 +1061,7 @@ std::vector<at::Tensor> reduce_bn_CUDA(

   auto space_size = get_tensor_spatial_size(input);

-  int block_y = min(h_last_pow2(batch_size), int(MAX_BLOCK_SIZE/ 32));
+  int block_y = min(h_last_pow2(batch_size), int(MAX_BLOCK_SIZE/ WARP_SIZE));
   int block_x = max(1, min(MAX_BLOCK_SIZE/ block_y, h_last_pow2(space_size)));
   const dim3 block(block_x, block_y);
   const dim3 grid(feature_size);
@@ -1097,7 +1128,7 @@ at::Tensor batchnorm_backward_CUDA(

   auto space_size = get_tensor_spatial_size(input);

-  int block_x = max(32, min(MAX_BLOCK_SIZE, h_last_pow2(space_size)/4));
+  int block_x = max(WARP_SIZE, min(MAX_BLOCK_SIZE, h_last_pow2(space_size)/4));
   int block_y = max(1, min(MAX_BLOCK_SIZE/block_x, h_last_pow2(batch_size)/4));
   const dim3 block(block_x, block_y);
   int grid_z = max(1, min(65535, h_last_pow2(space_size)/4/block_x));
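The warp-size fixes above all follow one pattern: every place that hard-coded 32 (block shapes, shared scratch sizes, the OPTIMAL_TILE_W tile width) now derives from WARP_SIZE, which is 64 on HIP and 32 on CUDA. With MAX_BLOCK_SIZE at 512, a block holds at most 512 / WARP_SIZE per-warp partial sums, so a scratch buffer of WARP_SIZE entries always suffices (which is why s_mem shrinks from 160 ints). A standalone sketch of the resulting warp-size-agnostic two-stage block reduction, assuming the block size is a multiple of WARP_SIZE as the launch code above arranges; block_reduce_sum and block_sum_kernel are illustrative names, not the commit's:

#ifdef __HIP_PLATFORM_HCC__
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif

template <typename T>
__device__ __forceinline__ T warp_reduce_sum(T val) {
  for (int i = WARP_SIZE / 2; i > 0; i >>= 1)
#ifdef __HIP_PLATFORM_HCC__
    val += __shfl_down(val, i);
#else
    val += __shfl_down_sync(0xffffffff, val, i);
#endif
  return val;
}

// Stage one partial sum per warp in shared memory, then let the first warp
// reduce the partials. Sizing the scratch with WARP_SIZE (not a literal 32)
// keeps the lane/wid arithmetic and the buffer bound correct for 64-lane warps.
template <typename T>
__device__ T block_reduce_sum(T* scratch, T val) {
  int tid = threadIdx.y * blockDim.x + threadIdx.x;
  int block_size = blockDim.x * blockDim.y;
  int lane = tid % WARP_SIZE;
  int wid  = tid / WARP_SIZE;

  if (block_size > WARP_SIZE) {
    val = warp_reduce_sum(val);
    if (lane == 0) scratch[wid] = val;
    __syncthreads();
    val = (tid < block_size / WARP_SIZE) ? scratch[lane] : T(0);
  }
  if (wid == 0) val = warp_reduce_sum(val);
  return val;  // block total ends up in lane 0 of warp 0
}

__global__ void block_sum_kernel(const float* in, float* out, int n) {
  __shared__ float scratch[WARP_SIZE];
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  float v = (idx < n) ? in[idx] : 0.f;
  float s = block_reduce_sum(scratch, v);
  if (threadIdx.x == 0 && threadIdx.y == 0) out[blockIdx.x] = s;
}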

tests/distributed/synced_batchnorm/python_single_gpu_unit_test.py

Lines changed: 1 addition & 0 deletions
@@ -109,3 +109,4 @@ def compare(desc, inp1, inp2, error):
 else:
     print("*SBN single gpu failed*")

+assert sbn_result

tests/distributed/synced_batchnorm/single_gpu_unit_test.py

Lines changed: 3 additions & 0 deletions
@@ -157,3 +157,6 @@ def compare(desc, inp1, inp2, error):
     print("====SBN channel last single gpu passed tests")
 else:
     print("*SBN channel last single gpu failed*")
+
+assert sbn_result
+assert sbn_result_c_last

tests/distributed/synced_batchnorm/test_groups.py

Lines changed: 5 additions & 1 deletion
@@ -60,7 +60,11 @@ def compare(desc, inp1, inp2, error):
 grad = np.random.randn(batch_size, feature_size, space_size, space_size).astype(dtype)
 weight = np.random.randn(feature_size).astype(dtype)
 bias = np.random.randn(feature_size).astype(dtype)
+#count = torch.cuda.IntTensor([batch_size*space_size**2])
+count = [ space_size**2 * ( (i+1) * batch_size // args.world_size - i * batch_size // args.world_size ) for i in range(0, args.world_size)]
+count = torch.cuda.IntTensor(count)

+print("--- count : " , count)

 type_tensor = torch.cuda.FloatTensor
 if args.fp16:
@@ -153,7 +157,7 @@ def compare(desc, inp1, inp2, error):
 grad_input_r = (grad_output2_r - mean_dy_r.view(-1, 1, 1) - (inp2_r - m.view(-1, 1, 1)) / (b_v.view(-1,1,1) + eps) * mean_dy_xmu_r.view(-1, 1, 1) ) * torch.rsqrt(b_v.view(-1,1,1) + eps) * weight_r.view(-1,1,1)

 mean_dy, mean_dy_xmu, grad_weight, grad_bias = syncbn.reduce_bn(grad_output_t, inp_t, mean, inv_std, weight_t)
-grad_input = syncbn.batchnorm_backward(grad_output_t, inp_t, mean, inv_std, weight_t, mean_dy, mean_dy_xmu)
+grad_input = syncbn.batchnorm_backward(grad_output_t, inp_t, mean, inv_std, weight_t, mean_dy, mean_dy_xmu, count)

 if args.local_rank == 0:
     sbn_result = compare("comparing bias grad: ", grad_bias, grad_bias_r, error) and sbn_result
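The test_groups.py change above computes, per rank, how many elements actually feed the batch-norm statistics when the global batch does not split evenly across args.world_size, and passes that count tensor through to syncbn.batchnorm_backward. A minimal host-side sketch of the same partition arithmetic, written in C++ to match the rest of this page (per_rank_counts is a hypothetical helper, not part of the commit):

#include <cstdio>
#include <vector>

// Elements contributed by each rank: the batch is split as evenly as integer
// division allows, and every sample adds space_size * space_size elements
// per channel, mirroring the list comprehension in the test.
std::vector<int> per_rank_counts(int batch_size, int space_size, int world_size) {
  std::vector<int> counts(world_size);
  for (int i = 0; i < world_size; ++i) {
    int samples = (i + 1) * batch_size / world_size - i * batch_size / world_size;
    counts[i] = samples * space_size * space_size;
  }
  return counts;
}

int main() {
  // batch_size=5, space_size=16, world_size=2 -> 512 and 768:
  // rank 0 sees 2 samples, rank 1 sees 3.
  for (int c : per_rank_counts(5, 16, 2)) std::printf("%d\n", c);
  return 0;
}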

tests/distributed/synced_batchnorm/two_gpu_unit_test.py

Lines changed: 2 additions & 0 deletions
@@ -178,3 +178,5 @@ def compare(desc, inp1, inp2, error):
     print("====SBN two gpu passed tests")
 else:
     print("*SBN two gpu failed*")
+
+assert sbn_result
Lines changed: 6 additions & 6 deletions
@@ -1,8 +1,8 @@
-python python_single_gpu_unit_test.py
-python single_gpu_unit_test.py
-python test_batchnorm1d.py
-python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py
-python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp16
-python -m torch.distributed.launch --nproc_per_node=2 two_gpu_test_different_batch_size.py --apex
+python python_single_gpu_unit_test.py || exit 1
+python single_gpu_unit_test.py || exit 1
+python test_batchnorm1d.py || exit 1
+python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py || exit 1
+python -m torch.distributed.launch --nproc_per_node=2 two_gpu_unit_test.py --fp16 || exit 1
+python -m torch.distributed.launch --nproc_per_node=2 two_gpu_test_different_batch_size.py --apex || exit 1
 #beware, you need a system with at least 4 gpus to test group_size<world_size
 #python -m torch.distributed.launch --nproc_per_node=4 test_groups.py --group_size=2
