
Commit 5074a72

nickgg authored and facebook-github-bot committed
Add Layout field to Conv and Pool nodes and remove OCL specific versions (#3367)
Summary: For convolutions in Glow we use the NHWC layout, but NCHW is more efficient on GPUs. To enable this, the OCL backend adds backend-specific OCLConvolution, OCLMaxPool and OCLAvgPool nodes and transforms general Conv and Pool nodes into them by shuffling dimensions and adding TransposeNodes. This PR adds a `Layout` field to ConvolutionNode, MaxPoolNode and AvgPoolNode, which can be either NHWC or NCHW. The OCL backend still transforms these nodes, but into the same node type with the layout set to NCHW (we still add transposes). This will allow us to reuse this logic in other backends with the same NCHW preference without needing multiple identical BackendConvolutionNodes (etc.).

This **DOES NOT** add NCHW convolution support to any backends or change any OCL kernels - it just removes OCL-specific nodes in favour of more general ones. This was a consensus decision, but it is worth thinking about whether or not the code is clearer after this change.

Documentation: deleted the section in the docs referencing these nodes.

Pull Request resolved: #3367

Test Plan: ninja test in various modes (debug, release, asan). Ran resnet-runtime on OCL, ran image-classifier on OCL, and tracing-compare across interp, cpu and ocl.

Differential Revision: D16631421

Pulled By: nickgg

fbshipit-source-id: 3005a0cfda474db95020f2c1f162aebe4b016a59
1 parent 9b9ce4f commit 5074a72

24 files changed: +385 −386 lines changed

docs/NewBackendSpecificNode.md

Lines changed: 0 additions & 41 deletions

```diff
@@ -48,47 +48,6 @@ ReLU is max between zero and the input value. Glow lowers `ReLUNode` to two basi
 
 Please refer to the document in [Backend](https://github.com/pytorch/glow/blob/master/docs/Backends.md#backend-specific-nodes-and-instructions) part for source code details on adding a new backend-specific CPUMaxSplatNode on CPU.
 
-#### Data Layout Transformation for Conv Operator in OpenCL
-
-OpenCL Conv is faster in layout `NCHW`, but the default layout of convolution operator in Glow is `NHWC`. So we transpose the inputs/output and replace the `ConvolutionNode` with a backend-specific `OCLConvolutionNode` that uses `NCHW`. The transposes mostly can get optimized away thanks to the high-level graph optimizations.
-
-The OpenCL backend defines `OCLConvolution` in `tools/ClassGen/OpenCL/OpenCLSpecificNodes.h` to support layout `NCHW` input.
-
-```cpp
-BB.newNode("OCLConvolution")
-    .addInput("Input")
-    .addInput("Filter")
-    .addInput("Bias")
-    .addMember(MemberType::VectorUnsigned, "Kernels")
-    .addMember(MemberType::VectorUnsigned, "Strides")
-    .addMember(MemberType::VectorUnsigned, "Pads")
-    .addMember(MemberType::Unsigned, "Group")
-    .addResultFromCtorArg()
-    .setDocstring(
-        "This is an OpenCL-specific convolution implementation where the "
-        "filter, the bias and the input are in the NCHW format");
-```
-
-During `transformPostLowering()`, this `convertConvToNCHWConv` node which contains a `NCHWConvNode` node and multiple `Transpose` nodes for `Input`, `Filter` and `Result` replaces the aforementioned pattern.
-
-A corresponding backend-specific `OCLConvolution` instruction is also needed, defined in
-`tools/ClassGen/Backends/OpenCL/OpenCLSpecificInstrs.h`:
-
-```cpp
-BB.newBackendSpecificInstr("OCLConvolution")
-    .addOperand("Dest", OperandKind::Out)
-    .addOperand("Src", OperandKind::In)
-    .addOperand("Filter", OperandKind::In)
-    .addOperand("Bias", OperandKind::In)
-    .addMember(MemberType::VectorUnsigned, "Kernels")
-    .addMember(MemberType::VectorUnsigned, "Strides")
-    .addMember(MemberType::VectorUnsigned, "Pads")
-    .addMember(MemberType::Unsigned, "Group")
-    .autoIRGen()
-    .autoVerify(VerifyKind::SameElementType, {"Dest", "Src", "Filter", "Bias"});
-
-```
-
 
 ### References
 
```

include/glow/Backends/LayoutConverter.h

Lines changed: 28 additions & 10 deletions

```diff
@@ -23,7 +23,6 @@ namespace glow {
 
 /// Convert regular convolution nodes (that use NHWC) into a backend-specific
 /// convolution nodes using NCHW.
-template <class NCHWConvNode>
 Node *convertConvToNCHWConv(ConvolutionNode *CN, Function *F) {
   // Convert filter and input from NHWC (Glow's default) into NCHW.
   auto *NI = F->createTranspose("conv.input", CN->getInput(), NHWC2NCHW);
@@ -34,30 +33,49 @@ Node *convertConvToNCHWConv(ConvolutionNode *CN, Function *F) {
   auto outTy = F->getParent()->uniqueTypeWithNewShape(CN->getResult().getType(),
                                                       dimsNCHW);
 
-  auto *NC = F->addNode(new NCHWConvNode(
-      CN->getName(), outTy, NI, NF, CN->getBias(), CN->getKernels(),
-      CN->getStrides(), CN->getPads(), CN->getGroup(), CN->getDilation()));
+  auto *NC = F->addNode(
+      new ConvolutionNode(CN->getName(), outTy, NI, NF, CN->getBias(),
+                          CN->getKernels(), CN->getStrides(), CN->getPads(),
+                          CN->getGroup(), CN->getDilation(), NCHW));
   auto *NR = F->createTranspose("conv.result", NC, NCHW2NHWC);
 
   return NR;
 }
 
 /// Convert regular pool nodes (that use NHWC) into backend-specific nodes using
 /// NCHW.
-template <class PoolNode, class NCHWPoolNode>
-Node *convertPoolToNCHWPool(PoolNode *PN, Function *F) {
+Node *convertMaxPoolToNCHWPool(MaxPoolNode *PN, Function *F) {
   // Convert input from NHWC (Glow's default) into NCHW.
-  auto *NI = F->createTranspose("conv.input", PN->getInput(), NHWC2NCHW);
+  auto *NI = F->createTranspose("maxpool.input", PN->getInput(), NHWC2NCHW);
+
+  auto dimsNHWC = ShapeNHWC(PN->getResult().getType()->dims());
+  auto dimsNCHW = {dimsNHWC.n, dimsNHWC.c, dimsNHWC.h, dimsNHWC.w};
+  auto outTy = F->getParent()->uniqueTypeWithNewShape(PN->getResult().getType(),
+                                                      dimsNCHW);
+  auto AMT = F->getParent()->uniqueTypeWithNewShape(PN->getArgmax().getType(),
+                                                    dimsNCHW);
+
+  auto *NPN = F->addNode(new MaxPoolNode(PN->getName(), outTy, AMT, NI,
+                                         PN->getKernels(), PN->getStrides(),
+                                         PN->getPads(), NCHW));
+  auto *NR = F->createTranspose("maxpool.result", NPN->getResult(), NCHW2NHWC);
+
+  return NR;
+}
+
+Node *convertAvgPoolToNCHWPool(AvgPoolNode *PN, Function *F) {
+  // Convert input from NHWC (Glow's default) into NCHW.
+  auto *NI = F->createTranspose("maxpool.input", PN->getInput(), NHWC2NCHW);
 
   auto dimsNHWC = ShapeNHWC(PN->getResult().getType()->dims());
   auto dimsNCHW = {dimsNHWC.n, dimsNHWC.c, dimsNHWC.h, dimsNHWC.w};
   auto outTy = F->getParent()->uniqueTypeWithNewShape(PN->getResult().getType(),
                                                       dimsNCHW);
 
   auto *NPN =
-      F->addNode(new NCHWPoolNode(PN->getName(), outTy, NI, PN->getKernels()[0],
-                                  PN->getStrides()[0], PN->getPads()));
-  auto *NR = F->createTranspose("maxpool.result", NPN, NCHW2NHWC);
+      F->addNode(new AvgPoolNode(PN->getName(), outTy, NI, PN->getKernels(),
+                                 PN->getStrides(), PN->getPads(), NCHW));
+  auto *NR = F->createTranspose("avgpool.result", NPN->getResult(), NCHW2NHWC);
 
   return NR;
 }
```
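With the templates removed, these helpers return plain ConvolutionNode/MaxPoolNode/AvgPoolNode instances tagged NCHW. Below is a hedged sketch of how a backend with an NCHW preference might wire them into its `transformPostLowering()` hook; the backend class name and the iteration pattern are assumptions for illustration, not part of this diff:

```cpp
// Illustrative only: "MyNCHWBackend" is hypothetical; the helpers come
// from LayoutConverter.h, as shown in the diff above.
#include "glow/Backends/LayoutConverter.h"

bool MyNCHWBackend::transformPostLowering(Function *F,
                                          CompilationContext &cctx) const {
  bool changed = false;
  for (auto &node : F->getNodes()) {
    // Only rewrite nodes that are still in Glow's default NHWC layout.
    if (auto *CN = llvm::dyn_cast<ConvolutionNode>(&node)) {
      if (CN->getLayout() == NHWC) {
        NodeValue(CN, ConvolutionNode::ResultIdx)
            .replaceAllUsesOfWith(convertConvToNCHWConv(CN, F));
        changed = true;
      }
    } else if (auto *MPN = llvm::dyn_cast<MaxPoolNode>(&node)) {
      if (MPN->getLayout() == NHWC) {
        NodeValue(MPN, MaxPoolNode::ResultIdx)
            .replaceAllUsesOfWith(convertMaxPoolToNCHWPool(MPN, F));
        changed = true;
      }
    } else if (auto *APN = llvm::dyn_cast<AvgPoolNode>(&node)) {
      if (APN->getLayout() == NHWC) {
        NodeValue(APN, AvgPoolNode::ResultIdx)
            .replaceAllUsesOfWith(convertAvgPoolToNCHWPool(APN, F));
        changed = true;
      }
    }
  }
  return changed;
}
```

Each helper sandwiches the layout-tagged node between NHWC2NCHW and NCHW2NHWC transposes; adjacent transposes can then be folded away by the high-level graph optimizations.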

include/glow/Graph/Graph.h

Lines changed: 32 additions & 24 deletions

```diff
@@ -341,14 +341,15 @@ class Function final : public Named {
   /// \p group defines the number of groups the input and output channels should
   /// be divided into and convolved separately. \p dilation defines factor by
   /// which gap between 2 neighboring kernel elements is expanded along each
-  /// axis.
+  /// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.
 
-  ConvolutionNode *createConv(llvm::StringRef name, NodeValue input,
-                              NodeValue filter, NodeValue bias, TypeRef outTy,
-                              llvm::ArrayRef<unsigned_t> kernels,
-                              llvm::ArrayRef<unsigned_t> strides,
-                              llvm::ArrayRef<unsigned_t> pads, unsigned_t group,
-                              unsigned_t dilation = 1);
+  ConvolutionNode *
+  createConv(llvm::StringRef name, NodeValue input, NodeValue filter,
+             NodeValue bias, TypeRef outTy, llvm::ArrayRef<unsigned_t> kernels,
+             llvm::ArrayRef<unsigned_t> strides,
+             llvm::ArrayRef<unsigned_t> pads, unsigned_t group,
+             unsigned_t dilation = 1,
+             ConvolutionLayout layout = ConvolutionLayout::NHWC);
 
   /// Creates a ConvolutionNode with the given \p name which convolves the 4D
   /// \p input with \p filter and \bias. \p kernel defines the size of the
@@ -358,13 +359,14 @@ class Function final : public Named {
   /// \p group defines the number of groups the input and output channels should
   /// be divided into and convolved separately. \p dilation defines factor by
   /// which gap between 2 neighboring kernel elements is expanded along each
-  /// axis.
+  /// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.
 
-  ConvolutionNode *createConv(llvm::StringRef name, NodeValue input,
-                              NodeValue filter, NodeValue bias, TypeRef outTy,
-                              unsigned_t kernel, unsigned_t stride,
-                              unsigned_t pad, unsigned_t group,
-                              unsigned_t dilation = 1);
+  ConvolutionNode *
+  createConv(llvm::StringRef name, NodeValue input, NodeValue filter,
+             NodeValue bias, TypeRef outTy, unsigned_t kernel,
+             unsigned_t stride, unsigned_t pad, unsigned_t group,
+             unsigned_t dilation = 1,
+             ConvolutionLayout layout = ConvolutionLayout::NHWC);
 
   /// Creates a Convolution3DNode with the given \p name which convolves the 5D
   /// \p input with \p filter and \bias. \p kernels defines the size of the
@@ -405,8 +407,9 @@ class Function final : public Named {
   /// cells should be added to the input during convolution. \p group defines
   /// the number of groups the input and output channels should be divided into
   /// and convolved separately.
-  /// NOTE: ChannelwiseQuantizedConvolutionNode does not yet have an
-  /// implementation so attempting to run a graph containing this node fails.
+  /// NOTE: ChannelwiseQuantizedConvolutionNode does
+  /// not yet have an implementation so attempting to run a graph containing
+  /// this node fails.
   ChannelwiseQuantizedConvolutionNode *createChannelwiseQuantizedConv(
       llvm::StringRef name, NodeValue input, Constant *filter, Constant *bias,
       Constant *scales, Constant *offsets, TypeRef outTy,
@@ -419,25 +422,28 @@ class Function final : public Named {
   MaxPoolNode *createMaxPool(llvm::StringRef name, NodeValue input,
                              llvm::ArrayRef<unsigned_t> kernels,
                              llvm::ArrayRef<unsigned_t> strides,
-                             llvm::ArrayRef<unsigned_t> pads);
+                             llvm::ArrayRef<unsigned_t> pads,
+                             ConvolutionLayout layout = NHWC);
 
   MaxPoolNode *createMaxPool(llvm::StringRef name, NodeValue input,
                              unsigned_t kernel, unsigned_t stride,
-                             unsigned_t pad);
+                             unsigned_t pad, ConvolutionLayout layout = NHWC);
 
   AvgPoolNode *createAvgPool(llvm::StringRef name, NodeValue input,
                              llvm::ArrayRef<unsigned_t> kernels,
                              llvm::ArrayRef<unsigned_t> strides,
-                             llvm::ArrayRef<unsigned_t> pads);
+                             llvm::ArrayRef<unsigned_t> pads,
+                             ConvolutionLayout layout = NHWC);
 
   AvgPoolNode *createAvgPool(llvm::StringRef name, NodeValue input,
                              TypeRef outTy, llvm::ArrayRef<unsigned_t> kernels,
                              llvm::ArrayRef<unsigned_t> strides,
-                             llvm::ArrayRef<unsigned_t> pads);
+                             llvm::ArrayRef<unsigned_t> pads,
+                             ConvolutionLayout layout = NHWC);
 
   AvgPoolNode *createAvgPool(llvm::StringRef name, NodeValue input,
                              unsigned_t kernel, unsigned_t stride,
-                             unsigned_t pad);
+                             unsigned_t pad, ConvolutionLayout layout = NHWC);
 
   /// Creates and \returns an AdaptiveAvgPool node with \p name, \p input, and
   /// \p outTy. The AdaptiveAvgPoolNode will perform average pooling over the
@@ -1100,14 +1106,15 @@ class Function final : public Named {
   /// defines the number of groups the input and output channels should be
   /// divided into and convolved separately. \p dilation defines factor by
   /// which gap between 2 neighboring kernel elements is expanded along each
-  /// axis.
+  /// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.
   ConvolutionNode *createConv(PlaceholderBindings &bindings,
                               llvm::StringRef name, NodeValue input,
                               size_t outChannels,
                               llvm::ArrayRef<unsigned_t> kernels,
                               llvm::ArrayRef<unsigned_t> strides,
                               llvm::ArrayRef<unsigned_t> pads, unsigned_t group,
-                              unsigned_t dilation = 1);
+                              unsigned_t dilation = 1,
+                              ConvolutionLayout layout = NHWC);
 
   /// Creates a ConvolutionNode with the given \p name which convolves the 4D
   /// \p input. \p kernel defines the size of the height and width dimensions of
@@ -1117,12 +1124,13 @@ class Function final : public Named {
   /// defines the number of groups the input and output channels should be
   /// divided into and convolved separately.\p dilation defines factor by
   /// which gap between 2 neighboring kernel elements is expanded along each
-  /// axis.
+  /// axis. \p layout defines the Tensor layout and must be either NHWC or NCHW.
   ConvolutionNode *createConv(PlaceholderBindings &bindings,
                               llvm::StringRef name, NodeValue input,
                               size_t outChannels, unsigned_t kernel,
                               unsigned_t stride, unsigned_t pad,
-                              unsigned_t group, unsigned_t dilation = 1);
+                              unsigned_t group, unsigned_t dilation = 1,
+                              ConvolutionLayout layout = NHWC);
 
   /// Creates a Convolution3DNode with the given \p name which convolves the 5D
   /// \p input. \p kernels defines the size of the height, width, and depth
```
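The pool creators follow the same pattern. A short usage sketch, assuming a `Function *F` and an NHWC-typed NodeValue `in` (names are illustrative):

```cpp
// Omitting the layout keeps the old NHWC behaviour.
auto *mp = F->createMaxPool("pool.max", in, /*kernels=*/{2, 2},
                            /*strides=*/{2, 2}, /*pads=*/{0, 0, 0, 0});
// Passing NCHW only tags the node with its layout; the caller is still
// responsible for supplying data in that layout.
auto *ap = F->createAvgPool("pool.avg", in, {2, 2}, {2, 2}, {0, 0, 0, 0},
                            NCHW);
```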

include/glow/Graph/Nodes.h

Lines changed: 3 additions & 0 deletions

```diff
@@ -203,6 +203,9 @@ inline ShapeHWD calculate3DConvPoolOutputDims(
 /// Modes of the padding operation.
 enum PaddingMode { CONSTANT = 0, REFLECT, EDGE };
 
+/// Convolution Layouts.
+enum ConvolutionLayout { NHWC = 0, NCHW };
+
 /// Support for hashing the Nodes. This is required for using
 /// llvm::hash_combine.
 class Node;
```
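Because `ConvolutionLayout` is an unscoped enum, `NHWC` and `NCHW` are visible unqualified, which is how the creator defaults above are spelled (`layout = NHWC`). A trivial sketch of dispatching on the tag:

```cpp
// Sketch: layout-dependent handling in backend code.
ConvolutionLayout layout = NCHW;
switch (layout) {
case NHWC: // Glow's default: {batch, height, width, channels}.
  break;
case NCHW: // GPU-friendly: {batch, channels, height, width}.
  break;
}
```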

include/glow/IR/IRBuilder.h

Lines changed: 7 additions & 4 deletions

```diff
@@ -52,13 +52,16 @@ class IRBuilder {
   /// @name High-level, operation-level IRBuilder.
   ///@{
 
-  MaxPoolWithArgmaxInst *createMaxPoolWithArgmaxOp(
-      llvm::StringRef name, Value *input, llvm::ArrayRef<unsigned_t> kernels,
-      llvm::ArrayRef<unsigned_t> strides, llvm::ArrayRef<unsigned_t> pads);
+  MaxPoolWithArgmaxInst *
+  createMaxPoolWithArgmaxOp(llvm::StringRef name, Value *input,
+                            llvm::ArrayRef<unsigned_t> kernels,
+                            llvm::ArrayRef<unsigned_t> strides,
+                            llvm::ArrayRef<unsigned_t> pads, unsigned_t layout);
 
   AvgPoolInst *createAvgPoolOp(Value *input, llvm::ArrayRef<unsigned_t> kernels,
                                llvm::ArrayRef<unsigned_t> strides,
-                               llvm::ArrayRef<unsigned_t> pads);
+                               llvm::ArrayRef<unsigned_t> pads,
+                               unsigned_t layout);
 
   CrossEntropyLossInst *createCrossEntropyLossOp(llvm::StringRef name, Value *P,
                                                  Value *labels);
```
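At the IR level the layout travels as a plain `unsigned_t` holding the enum's value. A call-site sketch, assuming an `IRBuilder builder` and a `Value *in` (names are illustrative):

```cpp
// The unscoped ConvolutionLayout enum converts implicitly to unsigned_t.
auto *avg = builder.createAvgPoolOp(in, /*kernels=*/{2, 2},
                                    /*strides=*/{2, 2},
                                    /*pads=*/{0, 0, 0, 0},
                                    /*layout=*/NHWC);
```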

lib/Backends/Interpreter/InterpreterNodes.cpp

Lines changed: 7 additions & 0 deletions

```diff
@@ -282,6 +282,8 @@ void BoundInterpreterFunction::fwdConvolutionInstQuantizedImpl(
 }
 
 void BoundInterpreterFunction::fwdConvolutionInst(const ConvolutionInst *I) {
+  assert(I->getLayout() == NHWC &&
+         "Glow Interpreter supports only NHWC Convolutions");
   auto kernelSizes = I->getKernels();
   auto pads = I->getPads();
   auto strides = I->getStrides();
@@ -303,6 +305,8 @@ void BoundInterpreterFunction::fwdConvolutionInst(const ConvolutionInst *I) {
 
 void BoundInterpreterFunction::fwdConvolutionGradInst(
     const ConvolutionGradInst *I) {
+  assert(I->getLayout() == NHWC &&
+         "Glow Interpreter supports only NHWC Convolutions");
   auto inW = getWeightHandle(I->getSrc());
   auto inG = getWeightHandle(I->getSrcGrad());
   auto outG = getWeightHandle(I->getDestGrad());
@@ -753,6 +757,7 @@ static void fwdMaxPool(Tensor *inW, Tensor *outW, Tensor *argmaxW,
 }
 
 void BoundInterpreterFunction::fwdMaxPoolInst(const MaxPoolInst *I) {
+  assert(I->getLayout() == NHWC && "Glow Interpreter supports only NHWC Pools");
   auto inW = getTensor(I->getSrc());
   auto outW = getTensor(I->getDest());
 
@@ -770,6 +775,7 @@ void BoundInterpreterFunction::fwdMaxPoolInst(const MaxPoolInst *I) {
 
 void BoundInterpreterFunction::fwdMaxPoolWithArgmaxInst(
     const MaxPoolWithArgmaxInst *I) {
+  assert(I->getLayout() == NHWC && "Glow Interpreter supports only NHWC Pools");
   auto inW = getTensor(I->getSrc());
   auto outW = getTensor(I->getDest());
   auto argmaxW = getTensor(I->getArgmax());
@@ -888,6 +894,7 @@ void BoundInterpreterFunction::fwdAvgPoolInstI8Impl(const AvgPoolInst *I) {
 }
 
 void BoundInterpreterFunction::fwdAvgPoolInst(const AvgPoolInst *I) {
+  assert(I->getLayout() == NHWC && "Glow Interpreter supports only NHWC Pools");
   if (I->getSrc()->getType()->isQuantizedType()) {
     fwdAvgPoolInstI8Impl(I);
     return;
```
