Commit b975212

[New Operator] FusedRowwiseQuantizedSparseLengthsWeightedSumNode (#2368)
*Description*: As noted in #2292, we decided to implement both fused and unfused versions of rowwise-quantized SLWS.
*Testing*: Added OperatorTests and Caffe2ImporterTests.
*Documentation*: Added.
Closes #1698
1 parent 8b08f35 commit b975212

22 files changed: +1003 -68 lines changed

docs/Quantization.md

Lines changed: 13 additions & 0 deletions
@@ -212,3 +212,16 @@ Row-wise quantized SparseLengthsWeightedSum is also supported. Similar to the
 above, we compute scales and offsets per row, to be used with the `Data` input
 for the `RowwiseQuantizedSparseLengthsSumNode`. Scales and Offsets are inputs to
 the node. Output of this node is float, matching the Caffe2 implementation.
+
+### Fused Row-wise Quantization
+
+For some backends it may be beneficial to keep each row's scales and offsets
+fused inline with the data. Caffe2 implements nodes with fused storage, such as
+[SparseLengthsWeightedSumFused8BitRowwise](https://caffe2.ai/docs/operators-catalogue.html#sparselengthsweightedsumfused8bitrowwise). Glow
+supports such fused Nodes/Instructions, for example
+`FusedRowwiseQuantizedSparseLengthsWeightedSum`. The `ElemKind` of fused tensors
+is `Int8FusedQTy`. Tensors with `Int8FusedQTy` are 2-dimensional, and have an
+extra 8 columns for each row. The first extra 4 bytes are the float scale of the
+row, and the second extra 4 bytes are the int32_t offset. Note that similar to
+normal row-wise quantized tensors, they use a dummy scale and offset in the
+Type.

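To make the fused layout described above concrete, here is a small, self-contained C++ sketch (illustrative only, not part of this patch; the names and dimensions are hypothetical) that reads the trailing scale/offset of a row and dequantizes one element:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Dequantize element `col` of row `row` from an Int8FusedQTy-style buffer.
// Each row is `width` bytes: (width - 8) int8 data bytes, then a 4-byte float
// scale and a 4-byte int32_t offset.
float dequantizeFused(const std::vector<int8_t> &data, size_t width,
                      size_t row, size_t col) {
  const int8_t *rowPtr = data.data() + row * width;
  const int8_t *scaleOffsetPtr = rowPtr + width - 8;
  float scale;
  int32_t offset;
  std::memcpy(&scale, scaleOffsetPtr, sizeof(float));
  std::memcpy(&offset, scaleOffsetPtr + 4, sizeof(int32_t));
  return scale * (rowPtr[col] - offset);
}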
include/glow/Base/Tensor.h

Lines changed: 48 additions & 4 deletions
@@ -107,6 +107,18 @@ class Tensor final {
       auto *data = reinterpret_cast<int32_t *>(getData());
       std::fill(&data[0], &data[0] + size(), (int32_t)type_.getOffset());
     } break;
+    case ElemKind::Int8FusedQTy: {
+      assert(dims().size() == 2 && "Fused tensor must be 2-dimensional.");
+      assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns.");
+      const size_t width = dims()[1];
+      auto *data = reinterpret_cast<int8_t *>(getData());
+      for (size_t i = 0, e = dims()[0]; i < e; i++) {
+        int8_t *scaleOffsetPtr = &data[(i + 1) * width] - 8;
+        int32_t offset;
+        memcpy(&offset, scaleOffsetPtr + 4, 4);
+        std::fill(&data[i * width], scaleOffsetPtr, (int8_t)offset);
+      }
+    } break;
     default:
       // Non-quantized tensors are set to 0.
       std::fill(&getData()[0], &getData()[0] + size() * type_.getElementSize(),
@@ -174,8 +186,9 @@ class Tensor final {
   Tensor &operator=(const Tensor &other) = delete;
 
   /// Initialize the content of the tensor using the \p init method. The value
-  /// \p val is the initialization parameter. \p PRNG is used to generate
-  /// random numbers.
+  /// \p val is the initialization parameter. \p PRNG is used to generate random
+  /// numbers. Note that if the tensor's kind is Int8FusedQTy, then the fused
+  /// scales/offsets will not be modified.
   void init(InitKind init, float val, PseudoRNG &PRNG);
 
   /// \returns unowned tensor using the same data buffer as the current tensor
@@ -288,6 +301,17 @@ class Tensor final {
       return false;
     }
 
+    // For now, make sure that either both or neither of the tensors have
+    // Int8FusedQTy. While it is possible for an Int8QTy tensor to equal a
+    // Int8FusedQTy tensor if the Int8FusedQTy tensor has the same scale/offset
+    // on all of its rows, and that scale/offset match that of the Int8QTy, we
+    // do not support checking this for now.
+    assert(((getElementType() == ElemKind::Int8FusedQTy &&
+             other.getElementType() == ElemKind::Int8FusedQTy) ||
+            (getElementType() != ElemKind::Int8FusedQTy &&
+             other.getElementType() != ElemKind::Int8FusedQTy)) &&
+           "Int8FusedQTy only supports comparing against same ElemKind.");
+
     switch (getElementType()) {
     case ElemKind::FloatTy:
       return isEqualImpl<float>(other, allowedError);
@@ -315,6 +339,11 @@ class Tensor final {
       return isEqualImpl<int32_t>(other, allowedError);
    case ElemKind::Int64ITy:
      return isEqualImpl<int64_t>(other, allowedError);
+    // Note: We can use isEqualImpl() here because the scales/offsets will be
+    // compared as if they were data, so we will return false if any rowwise
+    // scale/offset does not match.
+    case ElemKind::Int8FusedQTy:
+      return isEqualImpl<int8_t>(other, allowedError);
     }
 
    // This is to make compiler happy. It can never reach this point as switch
@@ -701,8 +730,23 @@ template <class ElemTy> class Handle final {
     assert(filterSize > 0 && "invalid filter size");
     double scale = std::sqrt(3.0 / double(filterSize));
     std::uniform_real_distribution<> dist(-scale, scale);
-    for (auto &e : *this) {
-      e = dist(PRNG);
+    switch (getElementType()) {
+    default: {
+      for (auto &e : *this) {
+        e = dist(PRNG);
+      }
+      return;
+    }
+    case ElemKind::Int8FusedQTy: {
+      assert(dims().size() == 2 && "Fused tensor must be 2-dimensional.");
+      assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns.");
+      for (size_t i = 0, e = dims()[0]; i < e; i++) {
+        for (size_t j = 0, f = dims()[1] - 8; j < f; j++) {
+          at({i, j}) = dist(PRNG);
+        }
+      }
+      return;
+    }
     }
   }

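As a rough usage illustration of the Tensor changes above (a hypothetical snippet, not from the patch; it assumes the existing glow::Tensor quantized-type constructor and Handle API), a fused tensor carries a dummy Type-level scale/offset while zero() and initXavier() touch only the data columns:

// 10 rows of 16 int8 data bytes plus 8 fused bytes (float scale + int32_t
// offset) per row. The 1.0 / 0 are the dummy Type scale/offset; the real
// per-row values live in the last 8 bytes of each row.
glow::Tensor fused(glow::ElemKind::Int8FusedQTy, {10, 16 + 8}, 1.0, 0);
glow::PseudoRNG PRNG;
fused.zero(); // Fills each row's data columns with that row's fused offset;
              // the trailing 8 scale/offset bytes are left untouched.
fused.getHandle<int8_t>().initXavier(16, PRNG); // Randomizes data columns only.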
include/glow/Base/Type.h

Lines changed: 15 additions & 9 deletions
@@ -185,13 +185,14 @@ inline bool operator==(const ShapeNCHW &LHS, const ShapeNCHW &RHS) {
 /// An enum representing the type used by the elements of a tensor. The types of
 /// Handles for these tensors should match the element kind.
 enum class ElemKind : unsigned char {
-  FloatTy,   // 32-bit float type (float)
-  Float16Ty, // 16-bit float type (half, fp16)
-  Int8QTy,   // 8-bit quantized type (int8_t)
-  Int16QTy,  // 16-bit quantized type (int16_t)
-  Int32QTy,  // 32-bit quantized type (int32_t)
-  Int32ITy,  // 32-bit index type (int32_t)
-  Int64ITy,  // 64-bit index type (int64_t)
+  FloatTy,      // 32-bit float type (float)
+  Float16Ty,    // 16-bit float type (half, fp16)
+  Int8QTy,      // 8-bit quantized type (int8_t)
+  Int16QTy,     // 16-bit quantized type (int16_t)
+  Int32QTy,     // 32-bit quantized type (int32_t)
+  Int32ITy,     // 32-bit index type (int32_t)
+  Int64ITy,     // 64-bit index type (int64_t)
+  Int8FusedQTy, // 8-bit quantized type with fused scale/offset (int8_t)
 };
 
 /// A class that represents a type of a tensor.
@@ -360,6 +361,8 @@ struct Type final {
       return std::is_same<ElemTy, int32_t>::value;
     case ElemKind::Int64ITy:
       return std::is_same<ElemTy, int64_t>::value;
+    case ElemKind::Int8FusedQTy:
+      return std::is_same<ElemTy, int8_t>::value;
     }
     GLOW_UNREACHABLE("Invalid type.");
   }
@@ -368,7 +371,8 @@ struct Type final {
   bool isQuantizedType() const {
     return elementType_ == ElemKind::Int8QTy ||
            elementType_ == ElemKind::Int16QTy ||
-           elementType_ == ElemKind::Int32QTy;
+           elementType_ == ElemKind::Int32QTy ||
+           elementType_ == ElemKind::Int8FusedQTy;
   }
 
   /// \returns true if the type of this Tensor is one of the floating point
@@ -401,6 +405,8 @@ struct Type final {
       return sizeof(int32_t);
     case ElemKind::Int64ITy:
       return sizeof(int64_t);
+    case ElemKind::Int8FusedQTy:
+      return sizeof(int8_t);
     }
     GLOW_UNREACHABLE("Invalid type.");
   }
@@ -413,7 +419,7 @@ struct Type final {
   /// \return the textual name of the element \p Ty.
   static llvm::StringRef getElementName(ElemKind Ty) {
     static const char *names[] = {
-        "float", "float16", "i8", "i16", "i32", "index32", "index64",
+        "float", "float16", "i8", "i16", "i32", "index32", "index64", "i8fused",
     };
     return names[(int)Ty];
   }

include/glow/Graph/Graph.h

Lines changed: 54 additions & 14 deletions
@@ -573,18 +573,19 @@ class Function final : public Named {
                                  NodeValue data, NodeValue weights,
                                  NodeValue indices, NodeValue lengths);
 
-  /// Create a node, performing SparseLengthsSum operation, using rowwise
-  /// quantization for the input data. Gathers slices of the outer-most
-  /// dimension of Data indexed by Indices vector, and then accumulates them
-  /// into len(Lengths) entries: first Lengths[0] slices are aggregated to
-  /// Result[0], next Lengths[1] slices are aggregated to Result[1],
-  /// etc. I.e. sum(Lengths) must be equal to len(Indices).
+  /// Creates and \returns a node of \p name, performing the SparseLengthsSum
+  /// operation, using rowwise quantization for the input \p data with the \p
+  /// scales and \p offsets as separate input tensors. Gathers slices of the
+  /// outer-most dimension of data indexed by the \p indices vector, and then
+  /// accumulates them into len(\p lengths) entries: first Lengths[0] slices are
+  /// aggregated to Result[0], next Lengths[1] slices are aggregated to
+  /// Result[1], etc. I.e. sum(Lengths) must be equal to len(Indices).
   RowwiseQuantizedSparseLengthsWeightedSumNode *
-  createRowwiseQuantizedSparseLengthsWeightedSum(
-      llvm::StringRef name, Constant *data, Constant *scales, Constant *offsets,
-      NodeValue weights, NodeValue indices, NodeValue lengths);
+  createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Constant *data,
+                                         Constant *scales, Constant *offsets,
+                                         NodeValue indices, NodeValue lengths);
 
-  /// Same as \ref createRowwiseQuantizedSparseLengthsWeightedSum(), but expects
+  /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but expects
   /// float input \p data, which is rowwise-quantized internally.
   RowwiseQuantizedSparseLengthsWeightedSumNode *
   createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Tensor &data,
@@ -593,11 +594,11 @@ class Function final : public Named {
   /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but i-th slice is
   /// multiplied by weights[i]. len(weights) must be equal to len(indices).
   RowwiseQuantizedSparseLengthsWeightedSumNode *
-  createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Constant *data,
-                                         Constant *scales, Constant *offsets,
-                                         NodeValue indices, NodeValue lengths);
+  createRowwiseQuantizedSparseLengthsWeightedSum(
+      llvm::StringRef name, Constant *data, Constant *scales, Constant *offsets,
+      NodeValue weights, NodeValue indices, NodeValue lengths);
 
-  /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but expects
+  /// Same as \ref createRowwiseQuantizedSparseLengthsWeightedSum(), but expects
   /// float input \p data, which is rowwise-quantized internally.
   RowwiseQuantizedSparseLengthsWeightedSumNode *
   createRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name,
@@ -606,6 +607,45 @@ class Function final : public Named {
                                                  NodeValue indices,
                                                  NodeValue lengths);
 
+  /// Creates and \returns a node of \p name, performing the SparseLengthsSum
+  /// operation, using fused rowwise quantization for the input \p data wherein
+  /// the scales and offsets are fused inline with each row of data. \p data
+  /// must be ElemKind::Int8FusedQTy. Gathers slices of the outer-most dimension
+  /// of data indexed by the \p indices vector, and then accumulates them into
+  /// len(\p lengths) entries: first Lengths[0] slices are aggregated to
+  /// Result[0], next Lengths[1] slices are aggregated to Result[1],
+  /// etc. I.e. sum(Lengths) must be equal to len(Indices).
+  FusedRowwiseQuantizedSparseLengthsWeightedSumNode *
+  createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name,
+                                              Constant *data, NodeValue indices,
+                                              NodeValue lengths);
+
+  /// Same as \ref createFusedRowwiseQuantizedSparseLengthsSum(), but expects
+  /// float input \p data, which is rowwise-quantized and fused internally.
+  FusedRowwiseQuantizedSparseLengthsWeightedSumNode *
+  createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name,
+                                              Tensor &data, NodeValue indices,
+                                              NodeValue lengths);
+
+  /// Same as \ref createFusedRowwiseQuantizedSparseLengthsSum(), but i-th slice
+  /// is multiplied by weights[i]. len(weights) must be equal to len(indices).
+  FusedRowwiseQuantizedSparseLengthsWeightedSumNode *
+  createFusedRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name,
+                                                      Tensor &data,
+                                                      NodeValue weights,
+                                                      NodeValue indices,
+                                                      NodeValue lengths);
+
+  /// Same as \ref createFusedRowwiseQuantizedSparseLengthsWeightedSum(), but
+  /// expects float input \p data, which is rowwise-quantized and fused
+  /// internally.
+  FusedRowwiseQuantizedSparseLengthsWeightedSumNode *
+  createFusedRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name,
+                                                      Constant *data,
+                                                      NodeValue weights,
+                                                      NodeValue indices,
+                                                      NodeValue lengths);
+
   /// Given a vector of segment lengths, calculates offsets of each segment and
   /// packs them next to the lengths. For the input vector of length N the
   /// output is a Nx2 matrix with (offset, lengths) packaged for each segment.

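For orientation, here is a rough sketch of how the new creation methods might be used (hypothetical, not part of the patch; it assumes Int64ITy indices and Int32ITy lengths as with the existing SparseLengths nodes, and uses the overload that rowwise-quantizes and fuses float data internally):

glow::Module mod;
glow::Function *F = mod.createFunction("main");

// Float embedding table; the create call below quantizes it per row and fuses
// each row's scale/offset into an Int8FusedQTy constant.
glow::Tensor data(glow::ElemKind::FloatTy, {1000, 64});

auto *indices =
    mod.createPlaceholder(glow::ElemKind::Int64ITy, {100}, "indices", false);
auto *lengths =
    mod.createPlaceholder(glow::ElemKind::Int32ITy, {10}, "lengths", false);

auto *SLS = F->createFusedRowwiseQuantizedSparseLengthsSum("fused_sls", data,
                                                           indices, lengths);
F->createSave("save", SLS);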
include/glow/Quantization/Base/Base.h

Lines changed: 9 additions & 0 deletions
@@ -136,6 +136,15 @@ std::vector<int8_t> createMapping(TypeRef inTy, TypeRef outTy,
 void tensorRowwiseQuantization(const Tensor &input, Tensor &output,
                                Tensor &scales, Tensor &offsets);
 
+/// Fused-rowwise quantize the tensor \p input. Scales and offsets are generated
+/// from each row of \p input. \p output is a tensor of the same shape as input
+/// but with 8 extra columns for storing the fused scale (4 bytes (columns) for
+/// float) and offset (4 bytes (columns) for int32_t).
+/// \pre input.dims().size() == 2
+/// \pre output.dims().size() == 2
+/// \pre input.dims()[1] + 8 == output.dims()[1]
+void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output);
+
 } // namespace quantization
 } // namespace glow

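The body of tensorFusedRowwiseQuantization is not shown in this excerpt. The following standalone sketch (an assumption about the described behavior using a simple min/max affine scheme, not the library's actual implementation) shows how one float row could be quantized and its scale/offset fused into the trailing 8 bytes:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Quantize one float row into (row.size() + 8) bytes: the int8 data values,
// then a 4-byte float scale and a 4-byte int32_t offset, chosen so that
// value ~= scale * (stored - offset), matching the fused kernels' math.
std::vector<int8_t> fuseQuantizeRow(const std::vector<float> &row) {
  const float min = *std::min_element(row.begin(), row.end());
  const float max = *std::max_element(row.begin(), row.end());
  float scale = (max - min) / 255.0f;
  if (scale == 0.0f) {
    scale = 1.0f; // Constant row; any nonzero scale works.
  }
  // Map min onto -128 (and max onto roughly 127).
  const int32_t offset =
      static_cast<int32_t>(std::round(-128.0f - min / scale));

  std::vector<int8_t> out(row.size() + 8);
  for (size_t i = 0; i < row.size(); i++) {
    const float q = std::round(row[i] / scale + offset);
    out[i] = static_cast<int8_t>(std::max(-128.0f, std::min(127.0f, q)));
  }
  std::memcpy(out.data() + row.size(), &scale, sizeof(float));
  std::memcpy(out.data() + row.size() + 4, &offset, sizeof(int32_t));
  return out;
}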
lib/Backends/CPU/LLVMIRGen.cpp

Lines changed: 30 additions & 0 deletions
@@ -237,6 +237,8 @@ llvm::Type *LLVMIRGen::getElementType(llvm::IRBuilder<> &builder,
     return builder.getInt32Ty();
   case ElemKind::Int32ITy:
     return builder.getInt32Ty();
+  case ElemKind::Int8FusedQTy:
+    return builder.getInt8Ty();
   }
   return nullptr;
 }
@@ -324,6 +326,9 @@ llvm::Value *LLVMIRGen::emitValueAddress(llvm::IRBuilder<> &builder,
   case ElemKind::Int32ITy:
     T = llvm::Type::getInt32PtrTy(ctx_);
     break;
+  case ElemKind::Int8FusedQTy:
+    T = llvm::Type::getInt8PtrTy(ctx_);
+    break;
   default:
     llvm_unreachable("Unimplemented");
     break;
@@ -469,6 +474,8 @@ llvm::Value *LLVMIRGen::emitConst(llvm::IRBuilder<> &builder, float val,
     return builder.getInt32(static_cast<int32_t>(val));
   case ElemKind::Int32ITy:
     return builder.getInt32(static_cast<int32_t>(val));
+  case ElemKind::Int8FusedQTy:
+    return builder.getInt8(static_cast<int8_t>(val));
   }
   llvm_unreachable("Unknown element type");
 }
@@ -2318,6 +2325,29 @@ void LLVMIRGen::generateLLVMIRForInstr(llvm::IRBuilder<> &builder,
     break;
   }
 
+  case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsWeightedSumInstKind: {
+    auto *N = cast<FusedRowwiseQuantizedSparseLengthsWeightedSumInst>(I);
+    auto *dest = N->getDest();
+    auto *data = N->getData();
+    auto *weights = N->getWeights();
+    auto *indices = N->getIndices();
+    auto *lengths = N->getLengths();
+    auto *destPtr = emitValueAddress(builder, dest);
+    auto *dataPtr = emitValueAddress(builder, data);
+    auto *weightsPtr = emitValueAddress(builder, weights);
+    auto *indicesPtr = emitValueAddress(builder, indices);
+    auto *lengthsPtr = emitValueAddress(builder, lengths);
+    auto *segments = emitConstSizeT(builder, lengths->dims()[0]);
+    auto *inLineSize = emitConstSizeT(builder, data->size() / data->dims()[0]);
+    auto *outLineSize = emitConstSizeT(builder, dest->size() / dest->dims()[0]);
+    auto *F = getFunction("fused_rowwise_quantized_sparse_lengths_weighted_sum",
+                          dest->getElementType());
+    createCall(builder, F,
+               {destPtr, dataPtr, weightsPtr, indicesPtr, lengthsPtr, segments,
+                inLineSize, outLineSize});
+    break;
+  }
+
   case Kinded::Kind::SparseToDenseInstKind: {
     auto *STDI = llvm::cast<SparseToDenseInst>(I);
     auto *indices = STDI->getIndices();

lib/Backends/CPU/libjit/libjit.cpp

Lines changed: 24 additions & 0 deletions
@@ -1057,6 +1057,30 @@ void libjit_rowwise_quantized_sparse_lengths_weighted_sum_f(
   }
 }
 
+void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_f(
+    float *dest, int8_t *data, float *weights, size_t *indices,
+    int32_t *lengths, size_t segments, size_t inLineSize, size_t outLineSize) {
+  memset(dest, 0, segments * outLineSize * sizeof(float));
+  size_t curIndex = 0;
+  for (size_t i = 0; i < segments; i++) {
+    for (int32_t j = 0, e = lengths[i]; j < e; j++) {
+      const float weight = weights[curIndex];
+      const size_t line = indices[curIndex];
+      const int8_t *currRowScaleOffsetPtr =
+          data + ((line + 1) * inLineSize) - 8;
+      float scale;
+      int32_t offset;
+      memcpy(&scale, currRowScaleOffsetPtr, sizeof(float));
+      memcpy(&offset, currRowScaleOffsetPtr + 4, sizeof(int32_t));
+      for (size_t k = 0; k < outLineSize; k++) {
+        const float fData = scale * (data[line * inLineSize + k] - offset);
+        dest[i * outLineSize + k] += weight * fData;
+      }
+      curIndex++;
+    }
+  }
+}
+
 void libjit_sparse_to_dense_f(float *dest, const size_t *indices,
                               const float *values, size_t numIndices,
                               size_t destSize, size_t valueSize) {

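A tiny worked example of the per-element math in the kernel above (the numbers are hypothetical): a row stored with scale 0.5 and offset -10 dequantizes a stored byte of 20 to 0.5 * (20 - (-10)) = 15.0, and with a weight of 2.0 it contributes 30.0 to its segment's accumulator:

const float scale = 0.5f;
const int32_t offset = -10;
const int8_t stored = 20;
const float weight = 2.0f;
const float fData = scale * (stored - offset); // 15.0f
const float contribution = weight * fData;     // 30.0f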