// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
template <typename Dtype>
-__global__ void im3d2col_kernel(const int n, const Dtype* data_im,
-                                const int height, const int width, const int depth,
-                                const int kernel_h, const int kernel_w, const int kernel_d,
-                                const int pad_h, const int pad_w, const int pad_d,
-                                const int stride_h, const int stride_w, const int stride_d,
-                                const int height_col, const int width_col, const int depth_col,
+__global__ void im3d2col_kernel(const int64_t n, const Dtype* data_im,
+                                const int64_t height, const int64_t width, const int64_t depth,
+                                const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
+                                const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+                                const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
+                                const int64_t height_col, const int64_t width_col, const int64_t depth_col,
                                 Dtype* data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
-    int d_out = index % depth_col;
-    int w_index = index / depth_col;
-    int w_out = w_index % width_col;
-    int h_index = w_index / width_col;
-    int h_out = h_index % height_col;
+    int64_t d_out = index % depth_col;
+    int64_t w_index = index / depth_col;
+    int64_t w_out = w_index % width_col;
+    int64_t h_index = w_index / width_col;
+    int64_t h_out = h_index % height_col;

-    int channel_in = h_index / height_col;
+    int64_t channel_in = h_index / height_col;
    // channel_in = 1;

-    int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
+    int64_t channel_out = channel_in * kernel_h * kernel_w * kernel_d;

-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
-    int d_in = d_out * stride_d - pad_d;
+    int64_t h_in = h_out * stride_h - pad_h;
+    int64_t w_in = w_out * stride_w - pad_w;
+    int64_t d_in = d_out * stride_d - pad_d;

    Dtype* data_col_ptr = data_col;
    data_col_ptr += channel_out * (height_col * width_col * depth_col) +
@@ -41,15 +41,15 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
    data_im_ptr += channel_in * (height * width * depth) +
                   h_in * (width * depth) + w_in * depth + d_in;

-    for (int i = 0; i < kernel_h; ++i)
+    for (int64_t i = 0; i < kernel_h; ++i)
    {
-      int h = h_in + i;
-      for (int j = 0; j < kernel_w; ++j)
+      int64_t h = h_in + i;
+      for (int64_t j = 0; j < kernel_w; ++j)
      {
-        int w = w_in + j;
-        for (int k = 0; k < kernel_d; ++k)
+        int64_t w = w_in + j;
+        for (int64_t k = 0; k < kernel_d; ++k)
        {
-          int d = d_in + k;
+          int64_t d = d_in + k;
          *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
                           h < height && w < width && d < depth) ?
            data_im_ptr[i * (width * depth) + j * depth + k] : ScalarConvert<int, Dtype>::to(0);
@@ -61,19 +61,19 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
}
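// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] Each thread of im3d2col_kernel
// handles one flat position `index` of the column buffer, laid out row-major
// as (channel_in, h_out, w_out, d_out). The chain of % and / at the top of
// the kernel simply inverts
//
//   index = ((channel_in * height_col + h_out) * width_col + w_out)
//           * depth_col + d_out
//
// peeling coordinates off from the innermost (depth) dimension outward.
// ---------------------------------------------------------------------------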

template <typename Dtype>
-void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
-              const int height, const int width, const int depth,
-              const int kernel_h, const int kernel_w, const int kernel_d,
-              const int pad_h, const int pad_w, const int pad_d,
-              const int stride_h, const int stride_w, const int stride_d,
+void im3d2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels,
+              const int64_t height, const int64_t width, const int64_t depth,
+              const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
+              const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+              const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
              Dtype* data_col)
{
  // We are going to launch channels * height_col * width_col * depth_col kernels, each
  // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
-  int num_kernels = channels * height_col * width_col * depth_col;
+  int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int64_t depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
+  int64_t num_kernels = channels * height_col * width_col * depth_col;
  im3d2col_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS, 0, stream>>>(num_kernels, data_im,
                                                   height, width, depth,
@@ -86,42 +86,42 @@ void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
}
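// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] The output geometry follows the
// usual convolution formula, e.g. height = 64, pad_h = 1, kernel_h = 3,
// stride_h = 1 gives height_col = (64 + 2*1 - 3) / 1 + 1 = 64. The int ->
// int64_t widening matters because the column buffer holds
// channels * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col
// elements, which overflows 32-bit int at moderate 3-D sizes. A minimal
// self-contained illustration (hypothetical sizes, not from this commit):
#include <cstdint>
#include <cstdio>

int main() {
  int64_t channels = 64, kernel_h = 3, kernel_w = 3, kernel_d = 3;
  int64_t height_col = 128, width_col = 128, depth_col = 128;
  // 64 * 27 * 128^3 = 3,623,878,656 elements -- already past INT32_MAX.
  int64_t col_elems = channels * kernel_h * kernel_w * kernel_d *
                      height_col * width_col * depth_col;
  std::printf("column buffer elements: %lld (INT32_MAX = %d)\n",
              (long long)col_elems, INT32_MAX);
  return 0;
}
// ---------------------------------------------------------------------------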

template <typename Dtype, typename Acctype>
-__global__ void col2im3d_kernel(const int n, const Dtype* data_col,
-                                const int height, const int width, const int depth,
-                                const int channels,
-                                const int patch_h, const int patch_w, const int patch_d,
-                                const int pad_h, const int pad_w, const int pad_d,
-                                const int stride_h, const int stride_w, const int stride_d,
-                                const int height_col, const int width_col, const int depth_col,
+__global__ void col2im3d_kernel(const int64_t n, const Dtype* data_col,
+                                const int64_t height, const int64_t width, const int64_t depth,
+                                const int64_t channels,
+                                const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
+                                const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+                                const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
+                                const int64_t height_col, const int64_t width_col, const int64_t depth_col,
                                 Dtype* data_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    Acctype val = 0;
-    int d = index % depth + pad_d;
-    int w_index = index / depth;
-    int w = w_index % width + pad_w;
-    int h_index = w_index / width;
-    int h = h_index % height + pad_h;
-    int c = h_index / height;
+    int64_t d = index % depth + pad_d;
+    int64_t w_index = index / depth;
+    int64_t w = w_index % width + pad_w;
+    int64_t h_index = w_index / width;
+    int64_t h = h_index % height + pad_h;
+    int64_t c = h_index / height;

    // compute the start and end of the output
-    int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
-    int d_col_end = min(d / stride_d + 1, depth_col);
-    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-    int w_col_end = min(w / stride_w + 1, width_col);
-    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-    int h_col_end = min(h / stride_h + 1, height_col);
+    int64_t d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
+    int64_t d_col_end = min(d / stride_d + 1, depth_col);
+    int64_t w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+    int64_t w_col_end = min(w / stride_w + 1, width_col);
+    int64_t h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+    int64_t h_col_end = min(h / stride_h + 1, height_col);

-    int offset =
+    int64_t offset =
      (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;

-    int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
-    int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
-    int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
-    for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+    int64_t coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
+    int64_t coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
+    int64_t coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
+    for (int64_t d_col = d_col_start; d_col < d_col_end; ++d_col)
+      for (int64_t h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int64_t w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
@@ -130,17 +130,17 @@ __global__ void col2im3d_kernel(const int n, const Dtype* data_col,
}
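// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] The offset/coeff constants fold
// the full column-buffer index into three fixed strides. A column entry for
// padded voxel (c, h, w, d) written from window (h_col, w_col, d_col) sits at
//
//   (c*patch_h*patch_w*patch_d + i*patch_w*patch_d + j*patch_d + k)
//     * height_col*width_col*depth_col
//   + (h_col*width_col + w_col)*depth_col + d_col
//
// with i = h - h_col*stride_h, j = w - w_col*stride_w, k = d - d_col*stride_d.
// Substituting i, j, k and collecting the h_col, w_col, d_col terms yields
// exactly offset + h_col*coeff_h_col + w_col*coeff_w_col + d_col*coeff_d_col,
// so the gather loops advance by constant strides instead of recomputing the
// full index on every iteration.
// ---------------------------------------------------------------------------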

template <typename Dtype, typename Acctype>
-void col2im3d(cudaStream_t stream, const Dtype* data_col, const int channels,
-              const int height, const int width, const int depth,
-              const int patch_h, const int patch_w, const int patch_d,
-              const int pad_h, const int pad_w, const int pad_d,
-              const int stride_h, const int stride_w, const int stride_d,
+void col2im3d(cudaStream_t stream, const Dtype* data_col, const int64_t channels,
+              const int64_t height, const int64_t width, const int64_t depth,
+              const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
+              const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+              const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
              Dtype* data_im)
{
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-  int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
-  int num_kernels = channels * height * width * depth;
+  int64_t height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int64_t width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int64_t depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
+  int64_t num_kernels = channels * height * width * depth;

  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
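
// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] col2im3d is a *gather*: one
// thread per image voxel sums every column entry that maps to it, so no two
// threads ever write the same output word. The scatter alternative -- one
// thread per column entry -- would need something like
//
//   atomicAdd(&data_im[im_index], data_col[col_index]);  // hypothetical
//
// which serializes colliding updates and makes float accumulation order
// non-deterministic; the gather form trades extra index math for
// deterministic, atomic-free accumulation.
// ---------------------------------------------------------------------------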