Commit 8b9dc34

Merge branch 'master' into support-step
2 parents: ee35777 + 55d0b53

File tree: 64 files changed, +3012 −500 lines

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-Number of instructions: 10
-Number of data dumps: 28
+Number of instructions: 8
+Number of data dumps: 24
 Result: 0
 Confidence: 0.991618

.circleci/config.yml

Lines changed: 2 additions & 2 deletions

@@ -32,7 +32,7 @@ update_submodule: &update_submodule
 linux_default: &linux_default
   resource_class: large
   machine:
-    image: default
+    image: ubuntu-2004:202010-01
   steps:
     - checkout
     - run:
@@ -46,7 +46,7 @@ linux_default: &linux_default
         set -e
         export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_FOR_ECR_READ_WRITE}
         export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY_FOR_ECR_READ_WRITE}
-        sudo pip install awscli==1.16.35 -qqq
+        pip install awscli==1.16.35 -qqq
        eval $(aws ecr get-login --region us-east-1 --no-include-email)
 
        docker pull ${DOCKER_IMAGE}

README.md

Lines changed: 2 additions & 1 deletion

@@ -146,7 +146,8 @@ following command should install the required dependencies:
   libprotobuf-dev llvm-8 llvm-8-dev ninja-build protobuf-compiler wget \
   opencl-headers libgoogle-glog-dev libboost-all-dev \
   libdouble-conversion-dev libevent-dev libssl-dev libgflags-dev \
-  libjemalloc-dev libpthread-stubs0-dev
+  libjemalloc-dev libpthread-stubs0-dev liblz4-dev libzstd-dev libbz2-dev \
+  libsodium-dev libfmt-dev
 ```
 
 [Note: Ubuntu 16.04 and 18.04 ship with llvm-6 and need to be upgraded before building Glow. Building Glow on Ubuntu 16.04 with llvm-7 fails because the llvm-7 xenial distribution uses an older C++ ABI; however, building Glow on Ubuntu 18.04 with llvm-7 has been tested and is successful.]

include/glow/Backends/Interpreter/InterpreterFunction.h

Lines changed: 14 additions & 0 deletions

@@ -211,6 +211,17 @@ class BoundInterpreterFunction : public IRInstructionProcessingHandler {
   template <typename ElemTy>
   void fwdFullyConnectedInstFloatImpl(const FullyConnectedInst *I);
 
+  template <typename ElemTy, typename OutputTy, typename AccumulatorTy>
+  void fwdDynRowwiseQuantizedFullyConnectedInstImpl(
+      Handle<ElemTy> inW, Handle<OutputTy> &outW, dim_t baseRow,
+      Handle<ElemTy> weightsW, Handle<float> biasW, Handle<float> scalesW,
+      Handle<int32_t> offsetsW);
+
+  void fwdDynRowwiseQuantizedFullyConnectedInstPreimpl(
+      Tensor *inputTensor, Tensor *weightsTensor, Tensor *biasTensor,
+      Tensor *resultTensor, Tensor *wScaleTensor, Tensor *wOffsetTensor,
+      bool isSymmetric, bool isPerBatchElement);
+
   template <typename ElemTy, typename AccumulatorTy,
             typename BiasElemTy = int32_t>
   void fwdRowwiseQuantizedFullyConnectedInstImpl(Value *inV, Value *outV,
@@ -263,6 +274,9 @@ class BoundInterpreterFunction : public IRInstructionProcessingHandler {
 
   template <typename ElemTy> void fwdTanhInstFloatImpl(const TanhInst *I);
 
+  template <typename ElemTy>
+  void fwdSoftPlusInstFloatImpl(const SoftPlusInst *I);
+
   template <typename ElemTy>
   void fwdCrossEntropyLossInstFloatImpl(const CrossEntropyLossInst *I);
 

include/glow/Graph/Graph.h

Lines changed: 39 additions & 0 deletions

@@ -644,6 +644,33 @@ class Function final : public IRContainer {
                          float beta = 1.0, bool transposeA = false,
                          bool transposeB = false);
 
+  /// Create and \returns a DynamicQuantizedFullyConnectedNode with \p name,
+  /// \p input, weights \p W, bias \p B, and a flag \p isSymmetric indicating
+  /// the mode. By default it is a dynamic quantized FC node, which takes fp16
+  /// inputs, symmetrically quantizes them, runs FC on them, dequantizes them,
+  /// and produces fp16 output. If \p isSymmetric is set to false, then inputs
+  /// are asymmetrically quantized. If \p isPerBatchElement is set to false,
+  /// then inputs are per-tensor quantized.
+  DynamicQuantizedFullyConnectedNode *createDynamicQuantizedFullyConnected(
+      llvm::StringRef name, NodeValue input, NodeValue W, NodeValue B,
+      bool isSymmetric = true, bool isPerBatchElement = true);
+
+  /// Create and \returns a DynamicRowwiseQuantizedFullyConnectedNode with \p
+  /// name, \p input, weights \p W, bias \p B, rowwise weight qparams \p wScale
+  /// and \p wOffset, and a flag \p isSymmetric indicating the mode. By default
+  /// it is a dynamic quantized FC node, which takes fp16 inputs, symmetrically
+  /// quantizes them, runs FC on them, dequantizes them, and produces fp16
+  /// output. If \p isSymmetric is set to false, then inputs are asymmetrically
+  /// quantized. If \p isPerBatchElement is set to false, then inputs are
+  /// per-tensor quantized.
+  DynamicRowwiseQuantizedFullyConnectedNode *
+  createDynamicRowwiseQuantizedFullyConnected(llvm::StringRef name,
+                                              NodeValue input, NodeValue W,
+                                              NodeValue B, NodeValue wScale,
+                                              NodeValue wOffset,
+                                              bool isSymmetric = true,
+                                              bool isPerBatchElement = true);
+
   /// Creates and \returns a FullyConnectedNode with \p name, \p input, weights
   /// \p W, bias \p B. If \p input is not 2 dimensional then it is flattened
   /// along \p axis. Note, output type and outputDepth are inferred based on
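For orientation, a minimal sketch of how the new creator could be exercised from test code. The Module/Function setup follows Glow's usual pattern; the shapes and names are illustrative assumptions, not taken from this commit.

```cpp
#include "glow/Graph/Graph.h"

// Sketch: dynamic quantized FC over fp16 inputs. The defaults request
// symmetric, per-batch-element quantization; shapes and names are made up.
Module mod;
Function *F = mod.createFunction("main");
auto *input = mod.createPlaceholder(ElemKind::Float16Ty, {2, 32}, "input",
                                    /* isTrainable */ false);
auto *W = mod.createPlaceholder(ElemKind::Float16Ty, {32, 10}, "weights",
                                /* isTrainable */ false);
auto *B = mod.createPlaceholder(ElemKind::Float16Ty, {10}, "bias",
                                /* isTrainable */ false);
auto *DQFC = F->createDynamicQuantizedFullyConnected("dqfc", input, W, B);
F->createSave("save", DQFC);
```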
@@ -780,6 +807,11 @@ class Function final : public IRContainer {
   /// \returns a LogitNode with \p name given \p input and \p eps.
   LogitNode *createLogit(llvm::StringRef name, NodeValue input, float eps);
 
+  /// Create a SoftPlus node with the given \p name, \p input and
+  /// output type \p outTy.
+  SoftPlusNode *createSoftPlus(llvm::StringRef name, NodeValue input,
+                               TypeRef outTy = nullptr);
+
   SoftMaxNode *createSoftMax(llvm::StringRef name, NodeValue input,
                              NodeValue selected, TypeRef outTy = nullptr,
                              float beta = 1.0);
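SoftPlus computes f(x) = ln(1 + e^x) elementwise. A small sketch of creating the node, reusing the Module/Function setup from the sketch above; leaving outTy as nullptr and assuming the output type then follows the input's is our reading, not something this commit states.

```cpp
// Sketch: elementwise SoftPlus over a float tensor (shape illustrative).
auto *x = mod.createPlaceholder(ElemKind::FloatTy, {4, 8}, "x",
                                /* isTrainable */ false);
SoftPlusNode *SP = F->createSoftPlus("softplus", x); // outTy defaults to null
F->createSave("save_softplus", SP);
```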
@@ -1488,6 +1520,11 @@ class Function final : public IRContainer {
                               NodeValue values, NodeValue defaultValue,
                               NodeValue lengths, llvm::ArrayRef<dim_t> mask);
 
+  // TODO: add description
+  SparseLabelSplitNode *
+  createSparseLabelSplit(llvm::StringRef name, NodeValue lengths,
+                         NodeValue indices, NodeValue values, dim_t numLabels);
+
   SaveNode *createSave(llvm::StringRef name, NodeValue input);
 
   /// Creates and \returns a SaveNode of \p input to \p output. If \p skipSuffix
@@ -2388,6 +2425,8 @@ bool isOutput(const Placeholder *PH, const Function &F);
 bool isInput(const Placeholder *PH, const Function &F);
 
 /// Helper vectors for common transpose shuffles.
+#define NCH2NHC \
+  { 0u, 2u, 1u }
 #define NCHW2NHWC \
   { 0u, 2u, 3u, 1u }
 #define NCTHW2NTHWC
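Like the existing shuffle macros, NCH2NHC is meant to be passed wherever a transpose shuffle is expected, e.g. to createTranspose. A hedged sketch, with an invented 3-D shape:

```cpp
// Sketch: permute a 3-D tensor from NCH to NHC layout; {0u, 2u, 1u} swaps
// the last two dimensions, so a {2, 3, 5} tensor becomes {2, 5, 3}.
auto *nch = mod.createPlaceholder(ElemKind::FloatTy, {2, 3, 5}, "nch",
                                  /* isTrainable */ false);
auto *nhc = F->createTranspose("nch_to_nhc", nch, NCH2NHC);
```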

include/glow/Graph/VerifierHelper.h

Lines changed: 13 additions & 0 deletions

@@ -198,12 +198,25 @@ bool checkSameShape(NodeValue A, NodeValue B, const Node *parent);
 /// \see expectCompareTrue for more details.
 bool checkType(NodeValue A, ElemKind expectedType, const Node *parent);
 
+/// Check that the element type of the operand \p A matches the expected type
+/// \p expectedType. \p parent is used to print the context of that check in
+/// case it fails.
+/// \see expectCompareTrue for more details.
+bool checkType(llvm::StringRef msg, NodeValue A, ElemKind expectedType,
+               const Node *parent);
+
 /// Check that the element type of the operand \p A matches any of the expected
 /// types \p expectedTypes. \p parent is used to print the context of that
 /// check in case it fails. \see expectCompareTrue for more details.
 bool checkType(NodeValue A, llvm::ArrayRef<ElemKind> expectedTypes,
                const Node *parent);
 
+/// Check that the element type of the operand \p A matches any of the expected
+/// types \p expectedTypes. \p parent is used to print the context of that
+/// check in case it fails. \see expectCompareTrue for more details.
+bool checkType(llvm::StringRef msg, NodeValue A,
+               llvm::ArrayRef<ElemKind> expectedTypes, const Node *parent);
+
 /// Check if \p A and \p B have the same value for isQuantized. \p parent is
 /// used to print the context of that check in case it fails.
 /// \see expectCompareTrue for more details.
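A sketch of how the new msg-taking overloads might be used inside a node verifier. The operand getters are hypothetical, and the exact way msg appears in the failure output is an assumption:

```cpp
// Sketch: inside a Node::verify()-style method, label each operand check so
// a failure names the operand (getIndices/getInput are hypothetical here).
bool isValid = checkType("Indices", getIndices(), ElemKind::Int32ITy, this);
isValid &= checkType("Input", getInput(),
                     {ElemKind::FloatTy, ElemKind::Float16Ty}, this);
return isValid;
```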

include/glow/Importer/CommonOperatorLoader.h

Lines changed: 0 additions & 53 deletions

@@ -997,55 +997,6 @@ class CommonOperatorLoader : public ProtobufLoader {
     return Error::success();
   }
 
-  Error loadTopK(const OpType &op, ArgumentDictionaryTy &dict) {
-    const std::string &opName = loadOperatorName(op);
-    NodeValue in;
-    ASSIGN_VALUE_OR_RETURN_ERR(in, getNodeValueByName(op.input(0)));
-    RETURN_ERR_IF_NOT(
-        op.input_size() <= 2,
-        opErrMsg(
-            op,
-            strFormat(
-                "TopK: Maximum number of inputs is 2, but found input size %d ",
-                op.input_size())));
-    unsigned_t k = 0;
-    if (op.input_size() > 1) {
-      Constant *kConst = getConstantByNameOrNull(op.input(1));
-      RETURN_ERR_IF_NOT(
-          kConst,
-          opErrMsg(op, "TopK: Non-constant k is not supported by Glow."));
-      RETURN_ERR_IF_NOT(
-          kConst->getElementType() == ElemKind::Int64ITy,
-          opErrMsg(op, strFormat(
-                           "TopK: k input must be of type Int64, but found "
-                           "input type '%s' ",
-                           kConst->getType()->getElementName().str().c_str())));
-      auto constH = kConst->getPayload().getHandle<int64_t>();
-      k = constH.at({0});
-    } else {
-      ASSIGN_VALUE_OR_RETURN_ERR(k, loadInt(dict["k"]));
-    }
-
-    int lastDim = in.dims().size() - 1;
-    int axis = lastDim;
-    if (dict.count("axis")) {
-      ASSIGN_VALUE_OR_RETURN_ERR(axis,
-                                 loadAxis<int>(dict["axis"], in.dims().size()));
-    }
-
-    RETURN_ERR_IF_NOT(
-        axis == lastDim,
-        opErrMsg(
-            op,
-            strFormat(
-                "TopK: Currently only support axis %d being last dimension %d ",
-                axis, lastDim)));
-
-    auto *R = G_->createTopK(opName, in, k);
-    RETURN_IF_ERR(addNodeAsOutput(op, R));
-    return Error::success();
-  }
-
   Error loadReduceOp(llvm::StringRef typeName, const OpType &op,
                      ArgumentDictionaryTy &dict) {
     const std::string &opName = loadOperatorName(op);
@@ -1624,10 +1575,6 @@ class CommonOperatorLoader : public ProtobufLoader {
     RETURN_IF_ERR(loadIdentity(op, dict));
     return true;
   }
-  if (typeName == "TopK") {
-    RETURN_IF_ERR(loadTopK(op, dict));
-    return true;
-  }
   if (typeName == "ReduceMean" || typeName == "ReduceSum" ||
       typeName == "ReduceMin" || typeName == "ReduceMax" ||
       typeName == "ReduceProd") {

include/glow/Importer/ONNXModelLoader.h

Lines changed: 4 additions & 0 deletions

@@ -187,6 +187,10 @@ class ONNXModelLoader
   Error loadScatterData(const ONNX_NAMESPACE::NodeProto &op,
                         const ArgumentDictionaryTy &dict);
 
+  /// Load TopK ONNX operator.
+  Error loadTopK(const ONNX_NAMESPACE::NodeProto &op,
+                 ArgumentDictionaryTy &dict);
+
   /// Load Conv ONNX operator.
   Error loadConv(const ONNX_NAMESPACE::NodeProto &op,
                  ArgumentDictionaryTy &dict);

include/glow/LLVMIRCodeGen/BundleSaver.h

Lines changed: 2 additions & 0 deletions

@@ -92,6 +92,8 @@ class BundleSaver {
   /// \returns the weight that the variable \p v is lowered into in one of the
   /// IR functions inside this bundle, or null if the variable is unknown.
   virtual Value *getWeightForNode(const Storage *V) const;
+  /// \returns LLVMIRGen used by the bundle saver.
+  virtual LLVMIRGen *getLLVMIRGen();
   /// Information about allocations.
   AllocationsInfo allocationsInfo_;
   /// The LLVM IR code generator.

include/glow/Optimizer/GraphOptimizer/GraphOptimizer.h

Lines changed: 13 additions & 1 deletion

@@ -140,7 +140,19 @@ bool executeVerticalFCWeightsSplit(Function *F, unsigned numOfChunks,
 
 /// Represents what kind of parallelization transformation should be performed
 /// by \ref parallelizeOps().
-enum class ParallelTransformKind { None, Data, Model };
+/// \p Data indicates splitting along the batch axis (dim = 0).
+/// \p Model indicates splitting along dim = 1.
+/// \p Model_Axis[n], where \p n is in [1-5], indicates splitting along
+/// dim = \p n.
+enum class ParallelTransformKind {
+  None,
+  Data,
+  Model,
+  Model_Axis1,
+  Model_Axis2,
+  Model_Axis3,
+  Model_Axis4,
+  Model_Axis5
+};
 
 /// A specialized ScopeGuard which prevents constant modification from occurring
 /// by swapping in temporary Placeholders in place of Constants during the
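To make the axis mapping concrete, a hedged illustration; the output shape and chunk count are invented for the example:

```cpp
// Sketch: for an op with output shape {8, 64, 32} split into two chunks:
//   ParallelTransformKind::Data        -> dim 0: two {4, 64, 32} pieces
//   ParallelTransformKind::Model       -> dim 1: two {8, 32, 32} pieces
//   ParallelTransformKind::Model_Axis2 -> dim 2: two {8, 64, 16} pieces
ParallelTransformKind kind = ParallelTransformKind::Model_Axis2;
```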

include/glow/Support/Support.h

Lines changed: 6 additions & 0 deletions

@@ -65,6 +65,12 @@ Stream &operator<<(Stream &os, const llvm::ArrayRef<E> list) {
   return os;
 }
 
+/// \returns a string obtained from the input string \p str by adding a
+/// delimiter string \p delimiter after each block of \p length characters.
+/// After the last block no delimiter is added.
+std::string separateString(const std::string &str, size_t length,
+                           const std::string &delimiter = "\n");
+
 /// \returns the escaped content of string \p str.
 /// The char '\n' becomes '\'+'n' and quotes are handled correctly.
 std::string escapeDottyString(const std::string &str);
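A small usage sketch of the contract documented above; the expected string is what the comment implies, not an output we have verified against this commit:

```cpp
#include "glow/Support/Support.h"
#include <cassert>

// "ABCDEFGH" in blocks of 3 with delimiter "-": the delimiter follows each
// full block but not the final one, giving "ABC-DEF-GH".
std::string s = glow::separateString("ABCDEFGH", 3, "-");
assert(s == "ABC-DEF-GH");
```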

lib/Backends/CPU/CPUBackend.cpp

Lines changed: 1 addition & 0 deletions

@@ -71,6 +71,7 @@ bool CPUBackend::shouldLower(const Node *N) const {
   case Kinded::Kind::ReluNodeKind:
   case Kinded::Kind::ClipNodeKind:
   case Kinded::Kind::LeakyReluNodeKind:
+  case Kinded::Kind::FullyConnectedNodeKind:
   case Kinded::Kind::ConvolutionNodeKind:
   case Kinded::Kind::SparseLengthsSumNodeKind:
     return false;

lib/Backends/CPU/tests/CPUOperatorTest.cpp

Lines changed: 8 additions & 0 deletions

@@ -271,8 +271,10 @@ std::set<std::string> glow::backendTestBlacklist = {
     "NoFusedConvert_FP32Accum/0",
     "SLWSTwoColumn_Float16_AccumFloat/0",
     "SparseToDense_Int64/0",
+    "SparseToDense_Float16_Int32/0",
     "SparseToDenseMask1/0",
     "SparseToDenseMask2/0",
+    "SparseLabelSplit/0",
     "BoolReshape/0",
     "BFloat16Reshape/0",
     "FP16Reshape/0",
@@ -281,6 +283,9 @@ std::set<std::string> glow::backendTestBlacklist = {
     "Flatten_BFloat16Ty/0",
     "Flatten_Float16Ty/0",
     "Bucketize/0",
+    "SoftPlus_Float/0",
+    "SoftPlus_BFloat16/0",
+    "SoftPlus_Float16/0",
     "BFloat16SoftMax/0",
     "FP16SoftMax/0",
     "BatchOneHotDataBFloat16/0",
@@ -402,6 +407,9 @@ std::set<std::string> glow::backendTestBlacklist = {
     "Int8BatchNorm3D/0",
     "LayerNorm_Float16/0",
     "LayerNorm_Int8_With_Float_Scale_Bias/0",
+    "DynamicQuantizedFullyConnectedBasic/0",
+    "DynamicQuantizedFullyConnectedStrongWeights/0",
+    "DynamicRowwiseQuantizedFullyConnectedBasic/0",
     "LSTMUnitFP16/0",
     "PyTorchLSTMFP16/0",
     "ChannelwiseQuantizedConv2D_NonZero_FloatBias/0",

lib/Backends/Habana/tests/HabanaOperatorTest.cpp

Lines changed: 5 additions & 0 deletions

@@ -281,6 +281,9 @@ std::set<std::string> glow::backendTestBlacklist = {
     "Logit_Float16/0",
     "LSTMUnitFP16/0",
     "PyTorchLSTMFP16/0",
+    "DynamicQuantizedFullyConnectedBasic/0",
+    "DynamicQuantizedFullyConnectedStrongWeights/0",
+    "DynamicRowwiseQuantizedFullyConnectedBasic/0",
     "matmulQuantized_InterpCompareParClone/0",
     "MaxPool/0",
     "ModuloInt32NoSignFollow/0",
@@ -318,6 +321,7 @@ std::set<std::string> glow::backendTestBlacklist = {
     "pow/0",
     "PReluSimple_Float/0",
     "PReluSimple_Float16/0",
+    "PRelu_Int8/0",
     "QuantizedArgMaxKeepDim/0",
     "QuantizedArgMaxNoKeepDim/0",
     "QuantizedArithmeticRescaled/0",
@@ -390,6 +394,7 @@ std::set<std::string> glow::backendTestBlacklist = {
     "SparseToDense_Int64/0",
     "SparseToDenseMask1/0",
     "SparseToDenseMask2/0",
+    "SparseLabelSplit/0",
     "Split_Float16/0",
     "SqueezeExpand/0",
     "Tanh/0",
