
Commit b6ea471

Revert "group conv with NHWC layout (pytorch#10585)"
This reverts commit cc53807.
1 parent: 5122250

6 files changed: +116 additions, -236 deletions

caffe2/operators/conv_op.h

Lines changed: 4 additions & 6 deletions
@@ -19,9 +19,8 @@ class ConvOp final : public ConvPoolOpBase<Context> {
     // Since this is the default convolution implementation, we will
     // use CAFFE_ENFORCE instead of OPERATOR_NEEDS_FEATURE.
     CAFFE_ENFORCE(
-        (group_ == 1 || order_ == StorageOrder::NCHW ||
-         std::is_same<Context, CPUContext>::value),
-        "Group convolution only supports NCHW order or CPUContext right now.");
+        group_ == 1 || order_ == StorageOrder::NCHW,
+        "Group convolution only supports NCHW order right now.");

     // Create shared buffer mutex in the constructor
     // to avoid race-condition in DAGNet.
@@ -75,9 +74,8 @@ class ConvGradientOp final : public ConvPoolOpBase<Context> {
         !(no_bias_ && OutputSize() == 3),
         "If bias is not present, you should not have 3 grad output.");
     CAFFE_ENFORCE(
-        (group_ == 1 || order_ == StorageOrder::NCHW ||
-         std::is_same<Context, CPUContext>::value),
-        "Group convolution only supports NCHW order or CPUContext right now.");
+        group_ == 1 || order_ == StorageOrder::NCHW,
+        "Group convolution only supports NCHW order right now.");
   }
   ~ConvGradientOp() {}
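With the revert, the NHWC escape hatch for CPUContext is gone: an operator with group_ > 1 and NHWC order now fails this check at construction time on every device. A minimal standalone sketch of the restored predicate (the enum and function here are illustrative stand-ins, not Caffe2 API):

    #include <stdexcept>

    enum class StorageOrder { NCHW, NHWC };  // stand-in for caffe2::StorageOrder

    // Mirrors the restored CAFFE_ENFORCE: grouped convolution is accepted
    // only when group == 1 or the layout is NCHW.
    void CheckGroupConvSupported(int group, StorageOrder order) {
      if (!(group == 1 || order == StorageOrder::NCHW)) {
        throw std::runtime_error(
            "Group convolution only supports NCHW order right now.");
      }
    }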

caffe2/operators/conv_op_impl.h

Lines changed: 58 additions & 81 deletions
@@ -199,11 +199,11 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
   const int M = filter.dim32(0);
   CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
   CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
-  CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
+  CAFFE_ENFORCE_EQ(filter.dim32(3), C);

   ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
   // The dimension of each kernel
-  const int kernel_dim = kernel_h() * kernel_w() * (C / group_);
+  const int kernel_dim = kernel_h() * kernel_w() * C;
   // The offset corresponding to a single input image, and a single output
   // image.
   const int input_offset = H * W * C;
@@ -224,7 +224,7 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
   }
   // Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
   // can skip im2col.
-  if (kernel_dim == (C / group_) && !HasPad() && !HasStride()) {
+  if (kernel_dim == C && !HasPad() && !HasStride()) {
     const int HxW = X.size() / (N * C);
     if (bias_data != nullptr) {
       ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
@@ -260,26 +260,20 @@ bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
         stride_w(),
         X_data,
         col_buffer_data,
-        &context_,
-        group_);
+        &context_);
     // Weight term
-    for (int group_id = 0; group_id < group_; ++group_id) {
-      math::GemmEx<T, Context>(
-          CblasNoTrans,
-          CblasTrans,
-          output_image_size,
-          M / group_,
-          kernel_dim,
-          1,
-          col_buffer_data + group_id * kernel_dim,
-          group_ * kernel_dim,
-          filter_data + group_id * (M / group_) * kernel_dim,
-          kernel_dim,
-          0,
-          Y_data + group_id * (M / group_),
-          M,
-          &context_);
-    }
+    math::Gemm<T, Context>(
+        CblasNoTrans,
+        CblasTrans,
+        output_image_size,
+        M,
+        kernel_dim,
+        1,
+        col_buffer_data,
+        filter_data,
+        0,
+        Y_data,
+        &context_);
     if (bias_data != nullptr) {
       // Bias term
       math::Gemm<T, Context>(
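The deleted loop ran one GemmEx per group over interleaved slices of the col buffer, while the restored code issues a single Gemm over the whole buffer. A naive standalone reference for what the grouped loop computed (a sketch with plain loops and illustrative names, not the Caffe2 kernel; row-major layouts assumed, and kernel_dim here means kernel_h * kernel_w * (C / G) as in the removed code):

    #include <vector>

    // col:    [output_image_size, G * kernel_dim]; group g's slice starts at
    //         column g * kernel_dim, with row stride G * kernel_dim (the lda
    //         the removed GemmEx used).
    // filter: [M, kernel_dim]; group g owns rows [g * M/G, (g + 1) * M/G).
    // Y:      [output_image_size, M]; group g writes columns from g * M/G.
    void GroupedNHWCForwardGemm(const std::vector<float>& col,
                                const std::vector<float>& filter,
                                std::vector<float>& Y,
                                int output_image_size, int M, int G,
                                int kernel_dim) {
      const int M_g = M / G;
      for (int g = 0; g < G; ++g) {
        for (int i = 0; i < output_image_size; ++i) {
          for (int j = 0; j < M_g; ++j) {
            float acc = 0.0f;
            for (int k = 0; k < kernel_dim; ++k) {
              acc += col[i * G * kernel_dim + g * kernel_dim + k] *
                     filter[(g * M_g + j) * kernel_dim + k];
            }
            Y[i * M + g * M_g + j] = acc;
          }
        }
      }
    }

With G = 1 the slicing disappears and this degenerates into exactly the single Gemm the revert restores.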
@@ -400,24 +394,19 @@ bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNHWC(
     const T* bias,
     T* Y) {
   const int G = group_;
-  const int kernel_dim = C / G;
-  for (int group_id = 0; group_id < group_; ++group_id) {
-    math::GemmEx<T, Context>(
-        CblasNoTrans,
-        CblasTrans,
-        N * HxW,
-        M / group_,
-        kernel_dim,
-        1.0f,
-        X + group_id * kernel_dim,
-        C,
-        filter + group_id * (M / group_) * kernel_dim,
-        kernel_dim,
-        0.0f,
-        Y + group_id * (M / group_),
-        M,
-        &context_);
-  }
+  CAFFE_ENFORCE_EQ(G, 1);
+  math::Gemm<T, Context>(
+      CblasNoTrans,
+      CblasTrans,
+      N * HxW,
+      M,
+      C,
+      1.0f,
+      X,
+      filter,
+      0.0f,
+      Y,
+      &context_);
   if (bias != nullptr) {
     const T* bias_multiplier_data = bias_multiplier_.template data<T>();
     math::Gemm<T, Context>(
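The restored 1x1 path only handles group_ == 1, hence the restored CAFFE_ENFORCE_EQ(G, 1). In NHWC with stride 1 and no padding, the whole convolution is one matrix product: X viewed as [N*HxW, C] times filter^T with filter viewed as [M, C]. A naive reference for that identity (a sketch, not the Caffe2 kernel):

    // Y[i, m] = sum_c X[i, c] * filter[m, c]; rows = N * HxW.
    void OneByOneConvNHWC(const float* X, const float* filter, float* Y,
                          int rows, int M, int C) {
      for (int i = 0; i < rows; ++i) {
        for (int m = 0; m < M; ++m) {
          float acc = 0.0f;
          for (int c = 0; c < C; ++c) {
            acc += X[i * C + c] * filter[m * C + c];
          }
          Y[i * M + m] = acc;
        }
      }
    }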
@@ -657,11 +646,11 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
   const int M = filter.dim32(0);
   CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
   CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
-  CAFFE_ENFORCE_EQ(filter.dim32(3), C / group_);
+  CAFFE_ENFORCE_EQ(filter.dim32(3), C);
   dfilter->ResizeLike(filter);

   // The dimension of each kernel
-  const int kernel_dim = kernel_h() * kernel_w() * (C / group_);
+  const int kernel_dim = kernel_h() * kernel_w() * C;
   // The offset corresponding to a single input image, and a single output
   // image.
   const int input_offset = H * W * C;
@@ -670,7 +659,7 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
   const int output_image_size = dY.dim32(1) * dY.dim32(2);
   // The col buffer is stored in CHW order as well - kernel_dim, and the height
   // and width.
-  col_buffer_.Resize(output_image_size, group_ * kernel_dim);
+  col_buffer_.Resize(output_image_size, kernel_dim);

   const T* Xdata = X.template data<T>();
   const T* const filter_data = filter.template data<T>();
@@ -717,26 +706,20 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
         stride_w(),
         Xdata,
         col_buffer_data,
-        &context_,
-        group_);
+        &context_);
     // Gradient with respect to filter.
-    for (int group_id = 0; group_id < group_; ++group_id) {
-      math::GemmEx<T, Context>(
-          CblasTrans,
-          CblasNoTrans,
-          M / group_,
-          kernel_dim,
-          output_image_size,
-          1,
-          dYdata + output_offset * image_id + group_id * (M / group_),
-          M,
-          col_buffer_data + group_id * kernel_dim,
-          group_ * kernel_dim,
-          1,
-          dfilter_data + group_id * (M / group_) * kernel_dim,
-          kernel_dim,
-          &context_);
-    }
+    math::Gemm<T, Context>(
+        CblasTrans,
+        CblasNoTrans,
+        M,
+        kernel_dim,
+        output_image_size,
+        1,
+        dYdata + output_offset * image_id,
+        col_buffer_data,
+        1,
+        dfilter_data,
+        &context_);
     if (!no_bias_) {
       // Gradient with respect to bias
       math::Gemv<T, Context>(
@@ -760,23 +743,18 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
     T* dXdata = dX->template mutable_data<T>();
     for (int image_id = 0; image_id < N; ++image_id) {
       // Compute gradient into col_buffer.
-      for (int group_id = 0; group_id < group_; ++group_id) {
-        math::GemmEx<T, Context>(
-            CblasNoTrans,
-            CblasNoTrans,
-            output_image_size,
-            kernel_dim,
-            M / group_,
-            1,
-            dYdata + output_offset * image_id + group_id * (M / group_),
-            M,
-            filter_data + group_id * (M / group_) * kernel_dim,
-            kernel_dim,
-            0,
-            col_buffer_data + group_id * kernel_dim,
-            group_ * kernel_dim,
-            &context_);
-      }
+      math::Gemm<T, Context>(
+          CblasNoTrans,
+          CblasNoTrans,
+          output_image_size,
+          kernel_dim,
+          M,
+          1,
+          dYdata + output_offset * image_id,
+          filter_data,
+          0,
+          col_buffer_data,
+          &context_);
       math::Col2Im<T, Context, StorageOrder::NHWC>(
           C,
           H,
@@ -793,8 +771,7 @@ bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
           stride_w(),
           col_buffer_data,
           dXdata,
-          &context_,
-          group_);
+          &context_);
       dXdata += input_offset;
     }
   }
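The two restored backward GEMMs have the usual im2col-gradient shapes: dfilter accumulates dY^T * col across images (beta = 1 in the Gemm above), and the col-buffer gradient is dY * filter, which Col2Im then scatters back into dX. A naive single-image reference (a sketch with illustrative names, not the Caffe2 kernels; P = output_image_size, K = kernel_dim):

    void BackwardGemms(const float* dY,      // [P, M]
                       const float* col,     // [P, K]
                       const float* filter,  // [M, K]
                       float* dfilter,       // [M, K], accumulated into
                       float* dcol,          // [P, K], overwritten
                       int P, int M, int K) {
      // dfilter += dY^T * col
      for (int m = 0; m < M; ++m) {
        for (int k = 0; k < K; ++k) {
          for (int p = 0; p < P; ++p) {
            dfilter[m * K + k] += dY[p * M + m] * col[p * K + k];
          }
        }
      }
      // dcol = dY * filter, later scattered into dX by Col2Im
      for (int p = 0; p < P; ++p) {
        for (int k = 0; k < K; ++k) {
          float acc = 0.0f;
          for (int m = 0; m < M; ++m) {
            acc += dY[p * M + m] * filter[m * K + k];
          }
          dcol[p * K + k] = acc;
        }
      }
    }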

caffe2/python/operator_test/group_conv_test.py

Lines changed: 2 additions & 4 deletions
@@ -6,7 +6,6 @@
 from hypothesis import assume, given, settings
 import hypothesis.strategies as st

-from caffe2.proto import caffe2_pb2
 from caffe2.python import core
 import caffe2.python.hypothesis_test_util as hu

@@ -23,7 +22,8 @@ class TestGroupConvolution(hu.HypothesisTestCase):
            input_channels_per_group=st.integers(1, 8),
            output_channels_per_group=st.integers(1, 8),
            batch_size=st.integers(1, 3),
-           order=st.sampled_from(["NCHW", "NHWC"]),
+           # TODO(jiayq): if needed, add NHWC support.
+           order=st.sampled_from(["NCHW"]),
            # Note: Eigen does not support group convolution, but it should
            # fall back to the default engine without failing.
            engine=st.sampled_from(["", "CUDNN", "EIGEN"]),
@@ -35,8 +35,6 @@ def test_group_convolution(
             input_channels_per_group, output_channels_per_group, batch_size,
             order, engine, use_bias, gc, dc):
         assume(size >= kernel)
-        # TODO: Group conv in NHWC not implemented for GPU yet.
-        assume(group == 1 or order == "NCHW" or gc.device_type != caffe2_pb2.CUDA)
         input_channels = input_channels_per_group * group
         output_channels = output_channels_per_group * group

caffe2/utils/math.h

Lines changed: 2 additions & 14 deletions
@@ -499,11 +499,6 @@ CAFFE2_API void Axpby(
     T* y,
     Context* context);

-// groups must be 1 for GPU
-// For NHWC order with groups > 1, the result will be layout in
-// NHW G RS C/G order to make data within the same group to be contiguous.
-// For NCHW order, groups doesn't make any difference because we're doing Im2Col
-// for each N and C is the slowest moving dimension among CHW.
 template <typename T, class Context, StorageOrder kOrder>
 CAFFE2_API void Im2Col(
     const int channels,
@@ -521,8 +516,7 @@ CAFFE2_API void Im2Col(
     const int stride_w,
     const T* img_data,
     T* col_data,
-    Context* context,
-    const int groups = 1);
+    Context* context);

 template <typename T, class Context, StorageOrder kOrder>
 CAFFE2_API void Im2ColNd(
@@ -539,11 +533,6 @@ CAFFE2_API void Im2ColNd(
     T* col_data,
     Context* context);

-// groups must be 1 for GPU
-// For NHWC order with groups > 1, the result will be layout in
-// NHW G RS C/G order to make data within the same group to be contiguous.
-// For NCHW order, groups doesn't make any difference because we're doing Im2Col
-// for each N and C is the slowest moving dimension among CHW.
 template <typename T, class Context, StorageOrder kOrder>
 CAFFE2_API void Col2Im(
     const int channels,
@@ -561,8 +550,7 @@ CAFFE2_API void Col2Im(
     const int stride_w,
     const T* col_data,
     T* img_data,
-    Context* context,
-    const int groups = 1);
+    Context* context);

 template <typename T, class Context, StorageOrder kOrder>
 CAFFE2_API void Col2ImNd(
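The deleted comments described the grouped NHWC col layout: for each output position, the values were ordered NHW G RS C/G, so that each group's kernel_h * kernel_w * (C/G) values sit contiguously for the per-group GEMM. A standalone sketch of that ordering for one image (assumed names; stride 1, no padding or dilation; not the Caffe2 implementation):

    // img: [H, W, C] in HWC order; col receives, per output (y, x),
    // G * R * S * (C / G) values ordered as [G][R][S][C/G].
    void Im2ColNHWCGrouped(const float* img, float* col,
                           int H, int W, int C, int R, int S, int G) {
      const int Cg = C / G;  // channels per group
      const int outH = H - R + 1;
      const int outW = W - S + 1;
      float* out = col;
      for (int y = 0; y < outH; ++y) {
        for (int x = 0; x < outW; ++x) {
          for (int g = 0; g < G; ++g) {  // group varies slowest after NHW
            for (int r = 0; r < R; ++r) {
              for (int s = 0; s < S; ++s) {
                for (int c = 0; c < Cg; ++c) {
                  *out++ = img[((y + r) * W + (x + s)) * C + g * Cg + c];
                }
              }
            }
          }
        }
      }
    }

With the revert, Im2Col and Col2Im drop the groups argument and always emit the plain NHW RS C ordering (which the sketch reproduces when G = 1).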
