// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
template <typename Dtype>
-__global__ void im3d2col_kernel(const int n, const Dtype* data_im,
-                                const int height, const int width, const int depth,
-                                const int kernel_h, const int kernel_w, const int kernel_d,
-                                const int pad_h, const int pad_w, const int pad_d,
-                                const int stride_h, const int stride_w, const int stride_d,
-                                const int height_col, const int width_col, const int depth_col,
+__global__ void im3d2col_kernel(const int64_t n, const Dtype* data_im,
+                                const int64_t height, const int64_t width, const int64_t depth,
+                                const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
+                                const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+                                const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
+                                const int64_t height_col, const int64_t width_col, const int64_t depth_col,
                                 Dtype* data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
-    int d_out = index % depth_col;
-    int w_index = index / depth_col;
-    int w_out = w_index % width_col;
-    int h_index = w_index / width_col;
-    int h_out = h_index % height_col;
+    int64_t d_out = index % depth_col;
+    int64_t w_index = index / depth_col;
+    int64_t w_out = w_index % width_col;
+    int64_t h_index = w_index / width_col;
+    int64_t h_out = h_index % height_col;

-    int channel_in = h_index / height_col;
+    int64_t channel_in = h_index / height_col;
    // channel_in = 1;

-    int channel_out = channel_in * kernel_h * kernel_w * kernel_d;
+    int64_t channel_out = channel_in * kernel_h * kernel_w * kernel_d;

-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
-    int d_in = d_out * stride_d - pad_d;
+    int64_t h_in = h_out * stride_h - pad_h;
+    int64_t w_in = w_out * stride_w - pad_w;
+    int64_t d_in = d_out * stride_d - pad_d;

    Dtype* data_col_ptr = data_col;
    data_col_ptr += channel_out * (height_col * width_col * depth_col) +
@@ -41,15 +41,15 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
    data_im_ptr += channel_in * (height * width * depth) +
                   h_in * (width * depth) + w_in * depth + d_in;

-    for (int i = 0; i < kernel_h; ++i)
+    for (int64_t i = 0; i < kernel_h; ++i)
    {
-      int h = h_in + i;
-      for (int j = 0; j < kernel_w; ++j)
+      int64_t h = h_in + i;
+      for (int64_t j = 0; j < kernel_w; ++j)
      {
-        int w = w_in + j;
-        for (int k = 0; k < kernel_d; ++k)
+        int64_t w = w_in + j;
+        for (int64_t k = 0; k < kernel_d; ++k)
        {
-          int d = d_in + k;
+          int64_t d = d_in + k;
          *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
                           h < height && w < width && d < depth) ?
            data_im_ptr[i * (width * depth) + j * depth + k] : ScalarConvert<int, Dtype>::to(0);
@@ -61,19 +61,19 @@ __global__ void im3d2col_kernel(const int n, const Dtype* data_im,
}
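// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] Each thread of im3d2col_kernel
// handles one flat position `index` of the column buffer, laid out row-major
// as (channel_in, h_out, w_out, d_out). The chain of % and / at the top of
// the kernel simply inverts
//
//   index = ((channel_in * height_col + h_out) * width_col + w_out)
//           * depth_col + d_out
//
// peeling coordinates off from the innermost (depth) dimension outward.
// ---------------------------------------------------------------------------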

template <typename Dtype>
-void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
-              const int height, const int width, const int depth,
-              const int kernel_h, const int kernel_w, const int kernel_d,
-              const int pad_h, const int pad_w, const int pad_d,
-              const int stride_h, const int stride_w, const int stride_d,
+void im3d2col(cudaStream_t stream, const Dtype* data_im, const int64_t channels,
+              const int64_t height, const int64_t width, const int64_t depth,
+              const int64_t kernel_h, const int64_t kernel_w, const int64_t kernel_d,
+              const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+              const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
              Dtype* data_col)
{
  // We are going to launch channels * height_col * width_col * depth_col kernels, each
  // kernel responsible for copying a single-channel grid.
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
-  int num_kernels = channels * height_col * width_col * depth_col;
+  int64_t height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int64_t width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int64_t depth_col = (depth + 2 * pad_d - kernel_d) / stride_d + 1;
+  int64_t num_kernels = channels * height_col * width_col * depth_col;
  im3d2col_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS, 0, stream>>>(num_kernels, data_im,
                                                   height, width, depth,
@@ -86,42 +86,42 @@ void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
}
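// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] The output geometry follows the
// usual convolution formula, e.g. height = 64, pad_h = 1, kernel_h = 3,
// stride_h = 1 gives height_col = (64 + 2*1 - 3) / 1 + 1 = 64. The int ->
// int64_t widening matters because the column buffer holds
// channels * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col
// elements, which overflows 32-bit int at moderate 3-D sizes. A minimal
// self-contained illustration (hypothetical sizes, not from this commit):
#include <cstdint>
#include <cstdio>

int main() {
  int64_t channels = 64, kernel_h = 3, kernel_w = 3, kernel_d = 3;
  int64_t height_col = 128, width_col = 128, depth_col = 128;
  // 64 * 27 * 128^3 = 3,623,878,656 elements -- already past INT32_MAX.
  int64_t col_elems = channels * kernel_h * kernel_w * kernel_d *
                      height_col * width_col * depth_col;
  std::printf("column buffer elements: %lld (INT32_MAX = %d)\n",
              (long long)col_elems, INT32_MAX);
  return 0;
}
// ---------------------------------------------------------------------------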

template <typename Dtype, typename Acctype>
-__global__ void col2im3d_kernel(const int n, const Dtype* data_col,
-                                const int height, const int width, const int depth,
-                                const int channels,
-                                const int patch_h, const int patch_w, const int patch_d,
-                                const int pad_h, const int pad_w, const int pad_d,
-                                const int stride_h, const int stride_w, const int stride_d,
-                                const int height_col, const int width_col, const int depth_col,
+__global__ void col2im3d_kernel(const int64_t n, const Dtype* data_col,
+                                const int64_t height, const int64_t width, const int64_t depth,
+                                const int64_t channels,
+                                const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
+                                const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+                                const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
+                                const int64_t height_col, const int64_t width_col, const int64_t depth_col,
                                 Dtype* data_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    Acctype val = 0;
-    int d = index % depth + pad_d;
-    int w_index = index / depth;
-    int w = w_index % width + pad_w;
-    int h_index = w_index / width;
-    int h = h_index % height + pad_h;
-    int c = h_index / height;
+    int64_t d = index % depth + pad_d;
+    int64_t w_index = index / depth;
+    int64_t w = w_index % width + pad_w;
+    int64_t h_index = w_index / width;
+    int64_t h = h_index % height + pad_h;
+    int64_t c = h_index / height;

    // compute the start and end of the output
-    int d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
-    int d_col_end = min(d / stride_d + 1, depth_col);
-    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
-    int w_col_end = min(w / stride_w + 1, width_col);
-    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
-    int h_col_end = min(h / stride_h + 1, height_col);
+    int64_t d_col_start = (d < patch_d) ? 0 : (d - patch_d) / stride_d + 1;
+    int64_t d_col_end = min(d / stride_d + 1, depth_col);
+    int64_t w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
+    int64_t w_col_end = min(w / stride_w + 1, width_col);
+    int64_t h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
+    int64_t h_col_end = min(h / stride_h + 1, height_col);

-    int offset =
+    int64_t offset =
      (c * patch_h * patch_w * patch_d + h * patch_w * patch_d + w * patch_d + d) * height_col * width_col * depth_col;

-    int coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
-    int coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
-    int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
-    for (int d_col = d_col_start; d_col < d_col_end; ++d_col)
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+    int64_t coeff_h_col = (1 - stride_h * patch_w * patch_d * height_col) * width_col * depth_col;
+    int64_t coeff_w_col = (1 - stride_w * patch_d * height_col * width_col) * depth_col;
+    int64_t coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
+    for (int64_t d_col = d_col_start; d_col < d_col_end; ++d_col)
+      for (int64_t h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int64_t w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
        }
      }
@@ -130,17 +130,17 @@ __global__ void col2im3d_kernel(const int n, const Dtype* data_col,
}
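// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] The offset/coeff constants fold
// the full column-buffer index into three fixed strides. A column entry for
// padded voxel (c, h, w, d) written from window (h_col, w_col, d_col) sits at
//
//   (c*patch_h*patch_w*patch_d + i*patch_w*patch_d + j*patch_d + k)
//     * height_col*width_col*depth_col
//   + (h_col*width_col + w_col)*depth_col + d_col
//
// with i = h - h_col*stride_h, j = w - w_col*stride_w, k = d - d_col*stride_d.
// Substituting i, j, k and collecting the h_col, w_col, d_col terms yields
// exactly offset + h_col*coeff_h_col + w_col*coeff_w_col + d_col*coeff_d_col,
// so the gather loops advance by constant strides instead of recomputing the
// full index on every iteration.
// ---------------------------------------------------------------------------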

template <typename Dtype, typename Acctype>
-void col2im3d(cudaStream_t stream, const Dtype* data_col, const int channels,
-              const int height, const int width, const int depth,
-              const int patch_h, const int patch_w, const int patch_d,
-              const int pad_h, const int pad_w, const int pad_d,
-              const int stride_h, const int stride_w, const int stride_d,
+void col2im3d(cudaStream_t stream, const Dtype* data_col, const int64_t channels,
+              const int64_t height, const int64_t width, const int64_t depth,
+              const int64_t patch_h, const int64_t patch_w, const int64_t patch_d,
+              const int64_t pad_h, const int64_t pad_w, const int64_t pad_d,
+              const int64_t stride_h, const int64_t stride_w, const int64_t stride_d,
              Dtype* data_im)
{
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-  int depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
-  int num_kernels = channels * height * width * depth;
+  int64_t height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int64_t width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int64_t depth_col = (depth + 2 * pad_d - patch_d) / stride_d + 1;
+  int64_t num_kernels = channels * height * width * depth;

  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
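
// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this commit] col2im3d is a *gather*: one
// thread per image voxel sums every column entry that maps to it, so no two
// threads ever write the same output word. The scatter alternative -- one
// thread per column entry -- would need something like
//
//   atomicAdd(&data_im[im_index], data_col[col_index]);  // hypothetical
//
// which serializes colliding updates and makes float accumulation order
// non-deterministic; the gather form trades extra index math for
// deterministic, atomic-free accumulation.
// ---------------------------------------------------------------------------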