Skip to content

Commit 262b3c6

Browse files
Authored — Merge pull request #93 from iotamudelta/batchnorm
Batchnorm
2 parents 8035a4b + e41b01f commit 262b3c6

File tree

3 files changed

+14
-6
lines changed

3 files changed

+14
-6
lines changed

aten/src/ATen/native/cuda/Loops.cuh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,7 @@ namespace at { namespace native {
2828

2929
template<int nt, int vt, typename func_t>
3030
__launch_bounds__(nt, 4)
31-
#ifdef __HIP_PLATFORM_HCC__
32-
__global__ void elementwise_kernel(int N, const func_t& f) {
33-
#else
3431
__global__ void elementwise_kernel(int N, func_t f) {
35-
#endif
3632
int tid = threadIdx.x;
3733
int nv = nt * vt;
3834
int idx = nv * blockIdx.x + tid;

aten/src/THCUNN/BatchNormalization.cu

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,26 @@
77
#include "THCDeviceTensor.cuh"
88
#include "THCDeviceTensorUtils.cuh"
99
#include "THCDeviceUtils.cuh"
10+
#if defined(__HIP_PLATFORM_HCC__)
11+
const int WARP_SIZE = 64;
12+
#else
1013
const int WARP_SIZE = 32;
14+
#endif
1115

1216
// The maximum number of threads in a block
17+
#if defined(__HIP_PLATFORM_HCC__)
18+
const int MAX_BLOCK_SIZE = 256;
19+
#else
1320
const int MAX_BLOCK_SIZE = 512;
21+
#endif
1422

1523
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
1624
static int getNumThreads(int nElem) {
25+
#if defined(__HIP_PLATFORM_HCC__)
26+
int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE };
27+
#else
1728
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
29+
#endif
1830
for (int i = 0; i != 5; ++i) {
1931
if (nElem <= threadSizes[i]) {
2032
return threadSizes[i];
@@ -116,7 +128,7 @@ __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
116128
sum = warpSum(sum);
117129

118130
// 'transpose', and reduce within warp again
119-
__shared__ T shared[32];
131+
__shared__ T shared[WARP_SIZE];
120132
__syncthreads();
121133
if (threadIdx.x % WARP_SIZE == 0) {
122134
shared[threadIdx.x / WARP_SIZE] = sum;

aten/src/THCUNN/generic/BatchNormalization.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ void THNN_(BatchNormalization_updateOutput)(
6464
dim3 blocks(input.getSize(1));
6565
dim3 threads(getNumThreads(input.getSize(2)));
6666
BatchNormalizationUpdateOutput_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
67-
input, output, weight, bias, eps, momentum, runningMean, runningVar,
67+
input, output, weight, bias, static_cast<accreal>(eps), static_cast<accreal>(momentum), runningMean, runningVar,
6868
saveMean, saveStd);
6969
}
7070
THCudaCheck(cudaGetLastError());

0 commit comments

Comments (0)