From 8c6229fe21319e5b591405b3a9da14af9850e035 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 15:34:49 -0800 Subject: [PATCH 1/8] [Graph] Fix comments for RWQ-SLWS/SLS --- include/glow/Graph/Graph.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/glow/Graph/Graph.h b/include/glow/Graph/Graph.h index 4ae088972c..5413619495 100644 --- a/include/glow/Graph/Graph.h +++ b/include/glow/Graph/Graph.h @@ -573,18 +573,19 @@ class Function final : public Named { NodeValue data, NodeValue weights, NodeValue indices, NodeValue lengths); - /// Create a node, performing SparseLengthsSum operation, using rowwise - /// quantization for the input data. Gathers slices of the outer-most - /// dimension of Data indexed by Indices vector, and then accumulates them - /// into len(Lengths) entries: first Lengths[0] slices are aggregated to - /// Result[0], next Lengths[1] slices are aggregated to Result[1], - /// etc. I.e. sum(Lengths) must be equal to len(Indices). + /// Creates and \returns a node of \p name, performing the SparseLengthsSum + /// operation, using rowwise quantization for the input \p data with the \p + /// scales and \p offsets as separate input tensors. Gathers slices of the + /// outer-most dimension of data indexed by the \p indices vector, and then + /// accumulates them into len(\p lengths) entries: first Lengths[0] slices are + /// aggregated to Result[0], next Lengths[1] slices are aggregated to + /// Result[1], etc. I.e. sum(Lengths) must be equal to len(Indices). 
RowwiseQuantizedSparseLengthsWeightedSumNode * - createRowwiseQuantizedSparseLengthsWeightedSum( - llvm::StringRef name, Constant *data, Constant *scales, Constant *offsets, - NodeValue weights, NodeValue indices, NodeValue lengths); + createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Constant *data, + Constant *scales, Constant *offsets, + NodeValue indices, NodeValue lengths); - /// Same as \ref createRowwiseQuantizedSparseLengthsWeightedSum(), but expects + /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but expects /// float input \p data, which is rowwise-quantized internally. RowwiseQuantizedSparseLengthsWeightedSumNode * createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Tensor &data, @@ -593,11 +594,11 @@ class Function final : public Named { /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but i-th slice is /// multiplied by weights[i]. len(weights) must be equal to len(indices). RowwiseQuantizedSparseLengthsWeightedSumNode * - createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Constant *data, - Constant *scales, Constant *offsets, - NodeValue indices, NodeValue lengths); + createRowwiseQuantizedSparseLengthsWeightedSum( + llvm::StringRef name, Constant *data, Constant *scales, Constant *offsets, + NodeValue weights, NodeValue indices, NodeValue lengths); - /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but expects + /// Same as \ref createRowwiseQuantizedSparseLengthsWeightedSum(), but expects /// float input \p data, which is rowwise-quantized internally. 
RowwiseQuantizedSparseLengthsWeightedSumNode * createRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name, From 914d10f7041fd2a7e6042fc96a59a8a40c430428 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 10:50:07 -0800 Subject: [PATCH 2/8] Add Int8FusedQTy --- include/glow/Base/Tensor.h | 16 ++++++++++++++++ include/glow/Base/Type.h | 24 +++++++++++++++--------- lib/Backends/CPU/LLVMIRGen.cpp | 4 ++++ lib/Base/Tensor.cpp | 15 +++++++++++++++ 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/include/glow/Base/Tensor.h b/include/glow/Base/Tensor.h index 8618c9bfae..5d89793c74 100644 --- a/include/glow/Base/Tensor.h +++ b/include/glow/Base/Tensor.h @@ -288,6 +288,17 @@ class Tensor final { return false; } + // For now, make sure that either both or neither of the tensors have + // Int8FusedQTy. While it is possible for an Int8QTy tensor to equal a + // Int8FusedQTy tensor if the Int8FusedQTy tensor has the same scale/offset + // on all of its rows, and that scale/offset match that of the Int8QTy, we + // do not support checking this for now. + assert(((getElementType() == ElemKind::Int8FusedQTy && + other.getElementType() == ElemKind::Int8FusedQTy) || + (getElementType() != ElemKind::Int8FusedQTy && + other.getElementType() != ElemKind::Int8FusedQTy)) && + "Int8FusedQTy only supports comparing against same ElemKind."); + switch (getElementType()) { case ElemKind::FloatTy: return isEqualImpl(other, allowedError); @@ -315,6 +326,11 @@ class Tensor final { return isEqualImpl(other, allowedError); case ElemKind::Int64ITy: return isEqualImpl(other, allowedError); + // Note: We can use isEqualImpl() here because the scales/offsets will be + // compared as if they were data, so we will return false if any rowwise + // scale/offset do not match. + case ElemKind::Int8FusedQTy: + return isEqualImpl(other, allowedError); } // This is to make compiler happy. 
It can never reach this point as switch diff --git a/include/glow/Base/Type.h b/include/glow/Base/Type.h index 9b6f2ab729..1b7622f885 100644 --- a/include/glow/Base/Type.h +++ b/include/glow/Base/Type.h @@ -185,13 +185,14 @@ inline bool operator==(const ShapeNCHW &LHS, const ShapeNCHW &RHS) { /// An enum representing the type used by the elements of a tensor. The types of /// Handles for these tensors should match the element kind. enum class ElemKind : unsigned char { - FloatTy, // 32-bit float type (float) - Float16Ty, // 16-bit float type (half, fp16) - Int8QTy, // 8-bit quantized type (int8_t) - Int16QTy, // 16-bit quantized type (int16_t) - Int32QTy, // 32-bit quantized type (int32_t) - Int32ITy, // 32-bit index type (int32_t) - Int64ITy, // 64-bit index type (int64_t) + FloatTy, // 32-bit float type (float) + Float16Ty, // 16-bit float type (half, fp16) + Int8QTy, // 8-bit quantized type (int8_t) + Int16QTy, // 16-bit quantized type (int16_t) + Int32QTy, // 32-bit quantized type (int32_t) + Int32ITy, // 32-bit index type (int32_t) + Int64ITy, // 64-bit index type (int64_t) + Int8FusedQTy, // 8-bit quantized type with fused scale/offset (int8_t) }; /// A class that represents a type of a tensor. 
@@ -360,6 +361,8 @@ struct Type final { return std::is_same::value; case ElemKind::Int64ITy: return std::is_same::value; + case ElemKind::Int8FusedQTy: + return std::is_same::value; } GLOW_UNREACHABLE("Invalid type."); } @@ -368,7 +371,8 @@ struct Type final { bool isQuantizedType() const { return elementType_ == ElemKind::Int8QTy || elementType_ == ElemKind::Int16QTy || - elementType_ == ElemKind::Int32QTy; + elementType_ == ElemKind::Int32QTy || + elementType_ == ElemKind::Int8FusedQTy; } /// \returns true if the type of this Tensor is one of the floating point @@ -401,6 +405,8 @@ struct Type final { return sizeof(int32_t); case ElemKind::Int64ITy: return sizeof(int64_t); + case ElemKind::Int8FusedQTy: + return sizeof(int8_t); } GLOW_UNREACHABLE("Invalid type."); } @@ -413,7 +419,7 @@ struct Type final { /// \return the textual name of the element \p Ty. static llvm::StringRef getElementName(ElemKind Ty) { static const char *names[] = { - "float", "float16", "i8", "i16", "i32", "index32", "index64", + "float", "float16", "i8", "i16", "i32", "index32", "index64", "i8fused", }; return names[(int)Ty]; } diff --git a/lib/Backends/CPU/LLVMIRGen.cpp b/lib/Backends/CPU/LLVMIRGen.cpp index dd1857a258..1816b381cf 100644 --- a/lib/Backends/CPU/LLVMIRGen.cpp +++ b/lib/Backends/CPU/LLVMIRGen.cpp @@ -237,6 +237,8 @@ llvm::Type *LLVMIRGen::getElementType(llvm::IRBuilder<> &builder, return builder.getInt32Ty(); case ElemKind::Int32ITy: return builder.getInt32Ty(); + case ElemKind::Int8FusedQTy: + return builder.getInt8Ty(); } return nullptr; } @@ -469,6 +471,8 @@ llvm::Value *LLVMIRGen::emitConst(llvm::IRBuilder<> &builder, float val, return builder.getInt32(static_cast(val)); case ElemKind::Int32ITy: return builder.getInt32(static_cast(val)); + case ElemKind::Int8FusedQTy: + return builder.getInt8(static_cast(val)); } llvm_unreachable("Unknown element type"); } diff --git a/lib/Base/Tensor.cpp b/lib/Base/Tensor.cpp index 6dc8c8e8cf..e441a648eb 100644 --- a/lib/Base/Tensor.cpp 
+++ b/lib/Base/Tensor.cpp @@ -283,6 +283,8 @@ void glow::dumpAsciiImpl(const Tensor *T, llvm::raw_ostream &os) { return dumpAsciiGenericImpl(T->getHandle(), os); case ElemKind::Int64ITy: return dumpAsciiGenericImpl(T->getHandle(), os); + case ElemKind::Int8FusedQTy: + return dumpAsciiGenericImpl(T->getHandle(), os); } } @@ -304,6 +306,8 @@ void glow::dumpImpl(const Tensor *T, llvm::raw_ostream &os) { return dumpGenericImpl(T->getHandle(), os); case ElemKind::Int64ITy: return dumpGenericImpl(T->getHandle(), os); + case ElemKind::Int8FusedQTy: + return dumpGenericImpl(T->getHandle(), os); } } @@ -368,6 +372,9 @@ void glow::genericTranspose(const Tensor *src, Tensor *dest, transposeSelectImpl(srcH, destH, shuffle); return; } + case ElemKind::Int8FusedQTy: { + llvm_unreachable("Transposing Int8FusedQTy is unsupported."); + } } } @@ -415,6 +422,10 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { getHandle().clear(val); break; } + case ElemKind::Int8FusedQTy: { + getHandle().clear(val); + break; + } } break; } @@ -449,6 +460,10 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { getHandle().initXavier(val, PRNG); break; } + case ElemKind::Int8FusedQTy: { + getHandle().initXavier(val, PRNG); + break; + } } break; } From 7c40989911ef53414b684c19d6b454e4a64001b6 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 10:50:07 -0800 Subject: [PATCH 3/8] [Quantization/Base] Add method for quantizing a fused tensor --- include/glow/Quantization/Base/Base.h | 9 +++++++ lib/Quantization/Base/Base.cpp | 35 +++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/glow/Quantization/Base/Base.h b/include/glow/Quantization/Base/Base.h index abbe2ef85d..e628fd031c 100644 --- a/include/glow/Quantization/Base/Base.h +++ b/include/glow/Quantization/Base/Base.h @@ -136,6 +136,15 @@ std::vector createMapping(TypeRef inTy, TypeRef outTy, void tensorRowwiseQuantization(const Tensor 
&input, Tensor &output, Tensor &scales, Tensor &offsets); +/// Fused-rowwise quantize the tensor \p input. Scales and offsets are generated +/// from each row of \p input. \p output is tensor of the same shape as input +/// but with 8 extra columns for storing fused scales (4 bytes (columns) for +/// float) and offset (4 bytes (columns) for int32_t). +/// \pre input.dims().size() == 2 +/// \pre output.dims().size() == 2 +/// \pre input.dims()[1] + 8 == output.dims()[1] +void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output); + } // namespace quantization } // namespace glow diff --git a/lib/Quantization/Base/Base.cpp b/lib/Quantization/Base/Base.cpp index a09b9f33be..bd94cf1ff6 100644 --- a/lib/Quantization/Base/Base.cpp +++ b/lib/Quantization/Base/Base.cpp @@ -393,5 +393,40 @@ void tensorRowwiseQuantization(const Tensor &input, Tensor &output, } } +void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output) { + // We are fusing the float scale and int32_t offset onto the end of each + // row. Thus input and output must both be 2 dimensional, with output having 8 + // extra columns for 4 bytes for float scale, and 4 bytes for int32_t offset. + assert(input.dims().size() == 2 && output.dims().size() == 2 && + "Input and output must be 2 dimensional."); + assert(input.dims()[1] + 8 == output.dims()[1] && + "Output must have 8 more columns than input."); + + const size_t outWidth = output.dims()[1]; + char *dataBasePtr = output.getUnsafePtr(); + + auto srcH = input.getHandle(); + auto destH = output.getHandle(); + for (size_t i = 0, e = input.dims()[0]; i < e; i++) { + auto slice = srcH.extractSlice(i); + auto rSrc = slice.getHandle(); + auto res = rSrc.minMaxArg(); + float min = rSrc.raw(res.first); + float max = rSrc.raw(res.second); + + // Set the dest's actual data based on the calculated scale/offset. 
+ TensorQuantizationParams qParams = + chooseQuantizationParams(min, max, quantization::Schema::Asymmetric); + for (size_t j = 0, f = input.dims()[1]; j < f; j++) { + destH.at({i, j}) = quantization::quantize(srcH.at({i, j}), qParams); + } + + // Now set the scale/offset at the end of each row. + char *currRowScaleOffsetPtr = dataBasePtr + (i + 1) * outWidth - 8; + memcpy(currRowScaleOffsetPtr, &qParams.scale, 4); + memcpy(currRowScaleOffsetPtr + 4, &qParams.offset, 4); + } +} + } // namespace quantization } // namespace glow From c00113fa1e32ec40a413c6f5f53d2bcadc7e610b Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 15:46:12 -0800 Subject: [PATCH 4/8] [New Operator] Add Interpreter support for FusedRowwiseQuantizedSparseLengthsWeightedSumNode and FusedRowwiseQuantizedSparseLengthsSumNode --- include/glow/Graph/Graph.h | 39 +++++++ lib/Backends/Interpreter/InterpreterNodes.cpp | 54 +++++++++ lib/Graph/Graph.cpp | 73 ++++++++++++ lib/Graph/Nodes.cpp | 32 ++++++ tests/unittests/OperatorTest.cpp | 108 ++++++++++++++++++ tools/ClassGen/InstrGen.cpp | 17 +++ tools/ClassGen/NodeGen.cpp | 19 +++ 7 files changed, 342 insertions(+) diff --git a/include/glow/Graph/Graph.h b/include/glow/Graph/Graph.h index 5413619495..5ba49262fa 100644 --- a/include/glow/Graph/Graph.h +++ b/include/glow/Graph/Graph.h @@ -607,6 +607,45 @@ class Function final : public Named { NodeValue indices, NodeValue lengths); + /// Creates and \returns a node of \p name, performing the SparseLengthsSum + /// operation, using fused rowwise quantization for the input \p data wherein + /// the scales and offsets are fused inline with each row of data. \p data + /// must be ElemKind::Int8FusedQTy. 
Gathers slices of the outer-most dimension + /// of data indexed by the \p indices vector, and then accumulates them into + /// len(\p lengths) entries: first Lengths[0] slices are aggregated to + /// Result[0], next Lengths[1] slices are aggregated to Result[1], + /// etc. I.e. sum(Lengths) must be equal to len(Indices). + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Constant *data, NodeValue indices, + NodeValue lengths); + + /// Same as \ref createFusedRowwiseQuantizedSparseLengthsSum(), but expects + /// float input \p data, which is rowwise-quantized and fused internally. + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Tensor &data, NodeValue indices, + NodeValue lengths); + + /// Same as \ref createFusedRowwiseQuantizedSparseLengthsWeightedSum(), but + /// expects float input \p data, which is rowwise-quantized and fused + /// internally. + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name, + Tensor &data, + NodeValue weights, + NodeValue indices, + NodeValue lengths); + + /// Same as \ref createFusedRowwiseQuantizedSparseLengthsSum(), but i-th slice + /// is multiplied by weights[i]. len(weights) must be equal to len(indices). + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name, + Constant *data, + NodeValue weights, + NodeValue indices, + NodeValue lengths); + + /// Given a vector of segment lengths, calculates offsets of each segment and + /// packs them next to the lengths. For the input vector of length N the + /// output is a Nx2 matrix with (offset, lengths) packaged for each segment. 
diff --git a/lib/Backends/Interpreter/InterpreterNodes.cpp b/lib/Backends/Interpreter/InterpreterNodes.cpp index 88a8ab6aeb..acc4fe5602 100644 --- a/lib/Backends/Interpreter/InterpreterNodes.cpp +++ b/lib/Backends/Interpreter/InterpreterNodes.cpp @@ -2200,6 +2200,60 @@ void BoundInterpreterFunction::fwdRowwiseQuantizedSparseLengthsWeightedSumInst( } } +void BoundInterpreterFunction:: + fwdFusedRowwiseQuantizedSparseLengthsWeightedSumInst( + const FusedRowwiseQuantizedSparseLengthsWeightedSumInst *I) { + auto *out = getTensor(I->getDest()); + auto *data = getTensor(I->getData()); + auto *weights = getTensor(I->getWeights()); + auto *indices = getTensor(I->getIndices()); + auto *lengths = getTensor(I->getLengths()); + + out->zero(); + + auto IH = indices->getHandle(); + auto LH = lengths->getHandle(); + + size_t segments = lengths->dims()[0]; + size_t totalLength = 0; + for (size_t i = 0; i < segments; i++) { + totalLength += LH.raw(i); + } + assert(totalLength == indices->dims()[0] && + "sum(Lengths) must be equal to len(Indices)"); + + const size_t inLineSize = data->size() / data->dims()[0]; + const size_t outLineSize = out->size() / out->dims()[0]; + + auto DH = data->getHandle(); + auto WH = weights->getHandle(); + auto OH = out->getHandle(); + + size_t curIdx = 0; + for (size_t i = 0; i < segments; i++) { + for (size_t j = 0, e = LH.raw(i); j < e; j++) { + const float weight = WH.raw(curIdx); + const size_t rowIdx = IH.raw(curIdx++); + size_t offsetIn = rowIdx * inLineSize; + size_t offsetOut = i * outLineSize; + // Get the scale and offset from the row; go to the current row and offset + // into it up until the last 8 bytes. Use memcpy to get the values out to + // avoid alignment issues of accessing 4-byte values. 
+ const char *currRowScaleOffsetPtr = + data->getUnsafePtr() + offsetIn + inLineSize - 8; + float scale; + int32_t offset; + memcpy(&scale, currRowScaleOffsetPtr, sizeof(float)); + memcpy(&offset, currRowScaleOffsetPtr + 4, sizeof(int32_t)); + for (size_t k = 0; k < outLineSize; k++) { + float d = quantization::dequantize( + DH.raw(offsetIn++), TensorQuantizationParams{scale, offset}); + OH.raw(offsetOut++) += d * weight; + } + } + } +} + void BoundInterpreterFunction::fwdLengthsToRangesInst( const LengthsToRangesInst *I) { auto ranges = getTensor(I->getDest())->getHandle(); diff --git a/lib/Graph/Graph.cpp b/lib/Graph/Graph.cpp index 8f5b9984fe..5ff3ccda6b 100644 --- a/lib/Graph/Graph.cpp +++ b/lib/Graph/Graph.cpp @@ -1447,6 +1447,79 @@ Function::createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, this, name, data, ones, indices, lengths); } +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsWeightedSum( + llvm::StringRef name, Constant *data, NodeValue weights, NodeValue indices, + NodeValue lengths) { + auto inDims = data->dims(); + ShapeVector outDims(inDims.begin(), inDims.end()); + outDims[0] = lengths.dims()[0]; + // The output column count is the same as the input column count, but without + // the extra 8 bytes for the fused scale/offset, as the output is not + // Int8FusedQTy. 
+ outDims[1] -= 8; + auto outTy = getParent()->uniqueType(ElemKind::FloatTy, outDims); + return addNode(new FusedRowwiseQuantizedSparseLengthsWeightedSumNode( + name, outTy, data, weights, indices, lengths)); +} + +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Constant *data, + NodeValue indices, + NodeValue lengths) { + auto ty = getParent()->uniqueType(ElemKind::FloatTy, {indices.dims()[0]}); + auto ones = createSplat(name.str() + ".ones", ty, 1.0); + return createFusedRowwiseQuantizedSparseLengthsWeightedSum(name, data, ones, + indices, lengths); +} + +/// Helper to create a FusedRowwiseQuantizedSparseLengthsWeightedSumNode in the +/// Function \p F with \p name, using \p data, \p weights, \p indices, and \p +/// lengths as inputs. The provided float data in \p Tensor is rowwise +/// quantized, creating a single Constant for the quantized data with the +/// scales and offsets fused inline per row, in the Module containing \p F. +static FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +quantizeDataAndCreateFusedRowwiseQuantizedSparseLengthsWeightedSum( + Function *F, llvm::StringRef name, Tensor &data, NodeValue weights, + NodeValue indices, NodeValue lengths) { + // For fused rowwise quantization, we must have a two-dimensional input. If + // passed in a single dimensional data Tensor then add an extra dimension. + const auto fDims = flattenCdr(data.dims()); + Tensor fData = data.getUnowned({fDims.first, fDims.second}); + + // Note: In rwqData, we are using a quantized type, however the scale/offset + // are set to dummy values 0.0/0. This is because the actually used + // scale/offset are fused inline with each row. Also, we expand the second + // dimension to include space for the scale/offset, each 4 bytes + // (float/int32_t). 
+ Constant *rwqData = F->getParent()->createConstant( + ElemKind::Int8FusedQTy, {fDims.first, fDims.second + 8}, 0.0, 0, "data"); + + quantization::tensorFusedRowwiseQuantization(fData, rwqData->getPayload()); + return F->createFusedRowwiseQuantizedSparseLengthsWeightedSum( + name, rwqData, weights, indices, lengths); +} + +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsWeightedSum( + llvm::StringRef name, Tensor &data, NodeValue weights, NodeValue indices, + NodeValue lengths) { + return quantizeDataAndCreateFusedRowwiseQuantizedSparseLengthsWeightedSum( + this, name, data, weights, indices, lengths); +} + +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Tensor &data, + NodeValue indices, + NodeValue lengths) { + auto ty = getParent()->uniqueType(ElemKind::FloatTy, {indices.dims()[0]}); + auto ones = createSplat(name.str() + ".ones", ty, 1.0); + return quantizeDataAndCreateFusedRowwiseQuantizedSparseLengthsWeightedSum( + this, name, data, ones, indices, lengths); +} + LengthsToRangesNode *Function::createLengthsToRanges(llvm::StringRef name, NodeValue lengths) { ShapeVector outDims({lengths.dims()[0], 2}); diff --git a/lib/Graph/Nodes.cpp b/lib/Graph/Nodes.cpp index ecf723b379..dd020e90f2 100644 --- a/lib/Graph/Nodes.cpp +++ b/lib/Graph/Nodes.cpp @@ -747,6 +747,38 @@ bool RowwiseQuantizedSparseLengthsWeightedSumNode::verify() const { return isValid; } +bool FusedRowwiseQuantizedSparseLengthsWeightedSumNode::verify() const { + bool isValid = checkType(getResult(), ElemKind::FloatTy, this); + isValid &= checkType(getData(), ElemKind::Int8FusedQTy, this); + isValid &= checkType(getWeights(), ElemKind::FloatTy, this); + isValid &= checkType(getIndices(), ElemKind::Int64ITy, this); + isValid &= checkType(getLengths(), ElemKind::Int32ITy, this); + isValid &= expectCompareTrue("Indices must be a 1D vector", + 
getIndices().dims().size(), size_t(1), this); + isValid &= expectCompareTrue("Lengths must be a 1D vector", + getLengths().dims().size(), size_t(1), this); + isValid &= expectCompareTrue("Weights must be a 1D vector", + getWeights().dims().size(), size_t(1), this); + isValid &= + expectCompareTrue("Weights and Indices must have the same size", + getWeights().dims()[0], getIndices().dims()[0], this); + isValid &= expectCompareTrue("Data must be 2 dimensional.", + getData().dims().size(), size_t(2), this); + isValid &= expectCompareTrue("Data must have more than 8 columns.", + getData().dims()[1], size_t(8), this, + CompareOperatorGreaterEqual()); + isValid &= expectCompareTrue("Result must be 2 dimensional.", + getResult().dims().size(), size_t(2), this); + // Wrap this in isValid to prevent potential segfault if the result is + // incorrectly shaped. + if (isValid) { + isValid &= expectCompareTrue( + "Result output shape should have second dim as 8 less than Data.", + getResult().dims()[1] + 8, getData().dims()[1], this); + } + return isValid; +} + bool LengthsToRangesNode::verify() const { bool isValid = checkType(getResult(), getLengths().getElementType(), this); isValid &= checkType(getLengths(), ElemKind::Int32ITy, this); diff --git a/tests/unittests/OperatorTest.cpp b/tests/unittests/OperatorTest.cpp index 9453cd596d..f9fd1561d2 100644 --- a/tests/unittests/OperatorTest.cpp +++ b/tests/unittests/OperatorTest.cpp @@ -4145,6 +4145,114 @@ TEST_P(InterpAndCPU, RowwiseQuantizedSparseLengthsSum) { EXPECT_TRUE(expected.isEqual(result, 0.02)); } +TEST_P(InterpAndCPU, FusedRowwiseQuantizedSparseLengthsWeightedSum) { + /* + DATA = [[2.0, -0.5, 13]] + WEIGHTS = [3, 1, 0, 0, 0, 0, 2, -0.5] + INDICES = [1, 0, 2, 0, 1, 2, 2, 0] + LENGTHS = [3, 0, 3, 2] + OUTPUT = [[0.5, 0, 0, 25]] + */ + Tensor data(ElemKind::FloatTy, {3, 1}); + data.getHandle() = { + 2.0, + -0.5, + 13, + }; + + Constant *weights = mod_.createConstant(ElemKind::FloatTy, {8}, "weights"); + 
weights->getPayload().getHandle() = { + 3., 1., 0., 0., 0., 0., 2., -0.5, + }; + + Placeholder *indices = + mod_.createPlaceholder(ElemKind::Int64ITy, {8}, "indices", + /* isTrainable */ false); + Placeholder *lengths = + mod_.createPlaceholder(ElemKind::Int32ITy, {4}, "lengths", + /* isTrainable */ false); + + ctx_.allocate(indices)->getHandle() = { + 1, 0, 2, 0, 1, 2, 2, 0, + }; + ctx_.allocate(lengths)->getHandle() = { + 3, + 0, + 3, + 2, + }; + + auto *R = F_->createFusedRowwiseQuantizedSparseLengthsWeightedSum( + "RQSLWS", data, weights, indices, lengths); + SaveNode *S = F_->createSave("save", R); + ctx_.allocate(S->getPlaceholder()); + + EE_.compile(CompilationMode::Infer, F_); + EE_.run(ctx_); + + Tensor &result = *ctx_.get(S->getPlaceholder()); + Tensor expected(ElemKind::FloatTy, {4, 1}); + expected.getHandle() = { + 0.5, + 0, + 0, + 25, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02)); +} + +TEST_P(InterpAndCPU, FusedRowwiseQuantizedSparseLengthsSum) { + /* + DATA = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + INDICES = [2, 0, 1, 2, 0, 0, 0, 0] + LENGTHS = [2, 0, 2, 1, 3] + OUTPUT = [ + [5.5, 6.9], + [0.0, 0.0], + [6.8, 9.1], + [1.0, 1.2], + [3.0, 3.6], + ] + */ + Tensor data(ElemKind::FloatTy, {3, 2}); + data.getHandle() = { + 1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f, + }; + + Placeholder *indices = mod_.createPlaceholder( + ElemKind::Int64ITy, {8}, "indices", /* isTrainable */ false); + Placeholder *lengths = mod_.createPlaceholder( + ElemKind::Int32ITy, {5}, "lengths", /* isTrainable */ false); + + ctx_.allocate(indices)->getHandle() = { + 2, 0, 1, 2, 0, 0, 0, 0, + }; + ctx_.allocate(lengths)->getHandle() = { + 2, 0, 2, 1, 3, + }; + + auto *R = F_->createFusedRowwiseQuantizedSparseLengthsSum("RQSLWS", data, + indices, lengths); + SaveNode *S = F_->createSave("save", R); + ctx_.allocate(S->getPlaceholder()); + + EE_.compile(CompilationMode::Infer, F_); + EE_.run(ctx_); + + Tensor &result = *ctx_.get(S->getPlaceholder()); + Tensor 
expected(ElemKind::FloatTy, {5, 2}); + expected.getHandle() = { + 5.5f, 6.9f, 0.0f, 0.0f, 6.8f, 9.1f, 1.0f, 1.2f, 3.0f, 3.6f, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02)); +} + TEST_P(InterpAndCPU, SparseToDense) { // Create and initialize inputs. Make input 3D to make sure // multidimensional values are handled properly. diff --git a/tools/ClassGen/InstrGen.cpp b/tools/ClassGen/InstrGen.cpp index 4d62ed71f9..c915a6c392 100644 --- a/tools/ClassGen/InstrGen.cpp +++ b/tools/ClassGen/InstrGen.cpp @@ -250,6 +250,23 @@ int main(int argc, char **argv) { {"Lengths", "ElemKind::Int32ITy"}) .autoVerify(VerifyKind::SameShape, {"Weights", "Indices"}); + BB.newInstr("FusedRowwiseQuantizedSparseLengthsWeightedSum") + .addOperand("Dest", OperandKind::Out) + .addOperand("Data", OperandKind::In) + .addOperand("Weights", OperandKind::In) + .addOperand("Indices", OperandKind::In) + .addOperand("Lengths", OperandKind::In) + .autoIRGen() + .autoVerify(VerifyKind::SameElementType, {"Dest", "ElemKind::FloatTy"}) + .autoVerify(VerifyKind::SameElementType, + {"Data", "ElemKind::Int8FusedQTy"}) + .autoVerify(VerifyKind::SameElementType, {"Weights", "ElemKind::FloatTy"}) + .autoVerify(VerifyKind::SameElementType, + {"Indices", "ElemKind::Int64ITy"}) + .autoVerify(VerifyKind::SameElementType, + {"Lengths", "ElemKind::Int32ITy"}) + .autoVerify(VerifyKind::SameShape, {"Weights", "Indices"}); + BB.newInstr("LengthsToRanges") .addOperand("Dest", OperandKind::Out) .addOperand("Lengths", OperandKind::In) diff --git a/tools/ClassGen/NodeGen.cpp b/tools/ClassGen/NodeGen.cpp index 6b8f61ad5c..1cea7aeb21 100644 --- a/tools/ClassGen/NodeGen.cpp +++ b/tools/ClassGen/NodeGen.cpp @@ -355,6 +355,25 @@ int main(int argc, char **argv) { "data is rowwise-quantized, where the Scales and Offsets " "are 1D tensors of length equal to the first dim of Data."); + BB.newNode("FusedRowwiseQuantizedSparseLengthsWeightedSum") + .addInput("Data") + .addInput("Weights") + .addInput("Indices") + 
.addInput("Lengths") + .addResultFromCtorArg() + .setDocstring("Gathers slices of the outer-most dimension of Data " + "indexed by Indices vector, and then accumulates them into " + "len(Lengths) entries: first Lengths[0] slices are " + "aggregated to Result[0], next Lengths[1] slices are " + "aggregated to Result[1], etc. I.e. sum(Lengths) must be " + "equal to len(Indices). Before doing aggregation, each " + "individual slice is scaled by its weight: Result[0] = " + "Weights[0] * Slice(0) + Weights[1] * Slice(1) + ... " + "It implies that len(Weights) == len(Indices). The input " + "data is fused rowwise-quantized, where the Scales and " + "Offsets are appended to the end of each row. Thus, Data " + "must be a two-dimensional tensor."); + BB.newNode("LengthsToRanges") .addInput("Lengths") .addResultFromCtorArg() From 23182ed7cf75aebd539f53b364e12f8bf2341b9c Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 16:16:41 -0800 Subject: [PATCH 5/8] [New Operator] Add CPU support for FusedRowwiseQuantizedSparseLengthsWeightedSumNode and FusedRowwiseQuantizedSparseLengthsSumNode --- lib/Backends/CPU/LLVMIRGen.cpp | 26 ++++++++++++++++++++++++++ lib/Backends/CPU/libjit/libjit.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/lib/Backends/CPU/LLVMIRGen.cpp b/lib/Backends/CPU/LLVMIRGen.cpp index 1816b381cf..e826823887 100644 --- a/lib/Backends/CPU/LLVMIRGen.cpp +++ b/lib/Backends/CPU/LLVMIRGen.cpp @@ -326,6 +326,9 @@ llvm::Value *LLVMIRGen::emitValueAddress(llvm::IRBuilder<> &builder, case ElemKind::Int32ITy: T = llvm::Type::getInt32PtrTy(ctx_); break; + case ElemKind::Int8FusedQTy: + T = llvm::Type::getInt8PtrTy(ctx_); + break; default: llvm_unreachable("Unimplemented"); break; @@ -2322,6 +2325,29 @@ void LLVMIRGen::generateLLVMIRForInstr(llvm::IRBuilder<> &builder, break; } + case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsWeightedSumInstKind: { + auto *N = cast(I); + auto *dest = 
N->getDest(); + auto *data = N->getData(); + auto *weights = N->getWeights(); + auto *indices = N->getIndices(); + auto *lengths = N->getLengths(); + auto *destPtr = emitValueAddress(builder, dest); + auto *dataPtr = emitValueAddress(builder, data); + auto *weightsPtr = emitValueAddress(builder, weights); + auto *indicesPtr = emitValueAddress(builder, indices); + auto *lengthsPtr = emitValueAddress(builder, lengths); + auto *segments = emitConstSizeT(builder, lengths->dims()[0]); + auto *inLineSize = emitConstSizeT(builder, data->size() / data->dims()[0]); + auto *outLineSize = emitConstSizeT(builder, dest->size() / dest->dims()[0]); + auto *F = getFunction("fused_rowwise_quantized_sparse_lengths_weighted_sum", + dest->getElementType()); + createCall(builder, F, + {destPtr, dataPtr, weightsPtr, indicesPtr, lengthsPtr, segments, + inLineSize, outLineSize}); + break; + } + case Kinded::Kind::SparseToDenseInstKind: { auto *STDI = llvm::cast(I); auto *indices = STDI->getIndices(); diff --git a/lib/Backends/CPU/libjit/libjit.cpp b/lib/Backends/CPU/libjit/libjit.cpp index 84297cfeff..088ee26bb7 100644 --- a/lib/Backends/CPU/libjit/libjit.cpp +++ b/lib/Backends/CPU/libjit/libjit.cpp @@ -1057,6 +1057,30 @@ void libjit_rowwise_quantized_sparse_lengths_weighted_sum_f( } } +void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_f( + float *dest, int8_t *data, float *weights, size_t *indices, + int32_t *lengths, size_t segments, size_t inLineSize, size_t outLineSize) { + memset(dest, 0, segments * outLineSize * sizeof(float)); + size_t curIndex = 0; + for (size_t i = 0; i < segments; i++) { + for (int32_t j = 0, e = lengths[i]; j < e; j++) { + const float weight = weights[curIndex]; + const size_t line = indices[curIndex]; + const int8_t *currRowScaleOffsetPtr = + data + ((line + 1) * inLineSize) - 8; + float scale; + int32_t offset; + memcpy(&scale, currRowScaleOffsetPtr, sizeof(float)); + memcpy(&offset, currRowScaleOffsetPtr + 4, sizeof(int32_t)); + for (size_t k = 
0; k < outLineSize; k++) { + const float fData = scale * (data[line * inLineSize + k] - offset); + dest[i * outLineSize + k] += weight * fData; + } + curIndex++; + } + } +} + void libjit_sparse_to_dense_f(float *dest, const size_t *indices, const float *values, size_t numIndices, size_t destSize, size_t valueSize) { From 013791ecae0b94119d90a23d0f42c3e1c3c549b1 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 16:24:18 -0800 Subject: [PATCH 6/8] [Caffe2ImporterTest] Add fused RWQ-SLWS/SLS tests --- lib/Importer/Caffe2ModelLoader.cpp | 133 ++++++++---- ...uantized_sparse_lengths_sum_init_net.pbtxt | 22 ++ ...tized_sparse_lengths_sum_predict_net.pbtxt | 12 ++ ...sparse_lengths_weighted_sum_init_net.pbtxt | 41 ++++ ...rse_lengths_weighted_sum_predict_net.pbtxt | 13 ++ tests/unittests/Caffe2ImporterTest.cpp | 193 ++++++++++++++++++ 6 files changed, 373 insertions(+), 41 deletions(-) create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt diff --git a/lib/Importer/Caffe2ModelLoader.cpp b/lib/Importer/Caffe2ModelLoader.cpp index 301b0f2ff2..c8a1d54424 100644 --- a/lib/Importer/Caffe2ModelLoader.cpp +++ b/lib/Importer/Caffe2ModelLoader.cpp @@ -874,13 +874,21 @@ llvm::Error Caffe2ModelLoader::loadOperator(const caffe2::OperatorDef &op) { } if (typeName == "SparseLengthsWeightedSum8BitsRowwise" || - typeName == "SparseLengthsSum8BitsRowwise") { - // If SparseLengthsWeightedSum8BitsRowwise, then the weights are the second - // input and so we need to shift indices/lengths/scalesBiases. 
+ typeName == "SparseLengthsSum8BitsRowwise" || + typeName == "SparseLengthsWeightedSumFused8BitRowwise" || + typeName == "SparseLengthsSumFused8BitRowwise") { + const bool isWeighted = + typeName == "SparseLengthsWeightedSum8BitsRowwise" || + typeName == "SparseLengthsWeightedSumFused8BitRowwise"; + const bool isFused = + typeName == "SparseLengthsWeightedSumFused8BitRowwise" || + typeName == "SparseLengthsSumFused8BitRowwise"; + // If weighted, then the weights are the second input and so we need to + // shift indices/lengths/scalesBiases. size_t indicesIdx = 1; size_t lengthsIdx = 2; size_t scalesBiasesIdx = 3; - if (typeName == "SparseLengthsWeightedSum8BitsRowwise") { + if (isWeighted) { indicesIdx++; lengthsIdx++; scalesBiasesIdx++; @@ -889,60 +897,103 @@ llvm::Error Caffe2ModelLoader::loadOperator(const caffe2::OperatorDef &op) { NodeValue data; ASSIGN_VALUE_OR_RETURN_ERR(data, getNodeValueOrCreateConstantByName(op.input(0))); + NodeValue weights; + if (isWeighted) { + ASSIGN_VALUE_OR_RETURN_ERR( + weights, getNodeValueOrCreateConstantByName(op.input(1))); + } NodeValue indices; ASSIGN_VALUE_OR_RETURN_ERR( indices, getNodeValueOrCreateConstantByName(op.input(indicesIdx))); NodeValue lengths; ASSIGN_VALUE_OR_RETURN_ERR( lengths, getNodeValueOrCreateConstantByName(op.input(lengthsIdx))); - NodeValue scalesBiases; - ASSIGN_VALUE_OR_RETURN_ERR(scalesBiases, getNodeValueOrCreateConstantByName( - op.input(scalesBiasesIdx))); - - Constant *scalesBiasesC = llvm::dyn_cast(scalesBiases); - RETURN_ERR_IF_NOT(scalesBiasesC, "scales_biases must be Constant."); Constant *dataC = llvm::dyn_cast(data); - RETURN_ERR_IF_NOT(dataC->getElementType() == ElemKind::Int8QTy, - "Data must be Int8QTy."); const size_t numRows = data.dims()[0]; // Make sure all the shapes make sense. 
RETURN_ERR_IF_NOT(lengths.dims().size() == 1, "lengths must be a vector."); RETURN_ERR_IF_NOT(indices.dims().size() == 1, "indices must be a vector."); - RETURN_ERR_IF_NOT(scalesBiases.dims().size() == 2, - "scale_bias has to be a matrix."); - RETURN_ERR_IF_NOT(scalesBiases.dims()[0] == numRows, - "scale_bias must have the same number of rows as data."); - RETURN_ERR_IF_NOT(scalesBiases.dims()[1] == 2, - "Second dim of scale_bias has to be equal to 2."); - - // Now strip out the scales and biases into their own tensors. - Constant *dataScales = G_.getParent()->createConstant( - ElemKind::FloatTy, {numRows}, "dataScales"); - Constant *dataOffsets = G_.getParent()->createConstant( - ElemKind::Int32ITy, {numRows}, "dataOffsets"); - - auto dataScalesH = dataScales->getHandle(); - auto dataOffsetsH = dataOffsets->getHandle(); - auto scalesBiasesH = scalesBiasesC->getHandle(); - for (size_t i = 0, e = numRows; i < e; i++) { - dataScalesH.at({i}) = scalesBiasesH.at({i, 0}); - // Caffe2 represents offsets (bias) using float, while Glow uses int32_t. - dataOffsetsH.at({i}) = static_cast(scalesBiasesH.at({i, 1})); - } Node *node; - if (typeName == "SparseLengthsWeightedSum8BitsRowwise") { - NodeValue weights; - ASSIGN_VALUE_OR_RETURN_ERR( - weights, getNodeValueOrCreateConstantByName(op.input(1))); - node = G_.createRowwiseQuantizedSparseLengthsWeightedSum( - opName, dataC, dataScales, dataOffsets, weights, indices, lengths); + if (isFused) { + // There is no specific fused quantized type in Caffe2, so we will load + // Int8QTy. We then change it from Int8QTy to Int8FusedQTy here if + // necessary -- another user could have already changed it. + if (dataC->getElementType() != ElemKind::Int8FusedQTy) { + RETURN_ERR_IF_NOT(dataC->getElementType() == ElemKind::Int8QTy, + "Data must be Int8QTy."); + // Use dummy 0.0/0 as scale/offset, since the actual scales/offsets are + // fused inline with the data. 
+ TypeRef fusedTy = G_.getParent()->uniqueType(ElemKind::Int8FusedQTy, + dataC->dims(), 0.0, 0); + dataC->setType(Storage::OutputIdx, fusedTy); + } + + // Caffe2 stores offsets as floats, whereas we want to use int32_t. + char *dataBasePtr = dataC->getPayload().getUnsafePtr(); + const size_t width = dataC->dims()[1]; + for (size_t i = 0, e = dataC->dims()[0]; i < e; ++i) { + // Must memcpy to the stack and back to avoid misaligned addresses. + char *currRowOffsetPtr = dataBasePtr + (i + 1) * width - 4; + float fOffset; + memcpy(&fOffset, currRowOffsetPtr, 4); + int32_t iOffset = static_cast(fOffset); + memcpy(currRowOffsetPtr, &iOffset, 4); + } + + // No other work to do, since the data is already loaded fused, so just + // create the new node with its inputs. + if (isWeighted) { + node = G_.createFusedRowwiseQuantizedSparseLengthsWeightedSum( + opName, dataC, weights, indices, lengths); + } else { + node = G_.createFusedRowwiseQuantizedSparseLengthsSum(opName, dataC, + indices, lengths); + } } else { - node = G_.createRowwiseQuantizedSparseLengthsSum( - opName, dataC, dataScales, dataOffsets, indices, lengths); + NodeValue scalesBiases; + ASSIGN_VALUE_OR_RETURN_ERR( + scalesBiases, + getNodeValueOrCreateConstantByName(op.input(scalesBiasesIdx))); + + Constant *scalesBiasesC = llvm::dyn_cast(scalesBiases); + RETURN_ERR_IF_NOT(scalesBiasesC, "scales_biases must be Constant."); + RETURN_ERR_IF_NOT(scalesBiases.dims().size() == 2, + "scale_bias has to be a matrix."); + RETURN_ERR_IF_NOT( + scalesBiases.dims()[0] == numRows, + "scale_bias must have the same number of rows as data."); + RETURN_ERR_IF_NOT(scalesBiases.dims()[1] == 2, + "Second dim of scale_bias has to be equal to 2."); + + // Now strip out the scales and biases into their own tensors. 
+ Constant *dataScales = G_.getParent()->createConstant( + ElemKind::FloatTy, {numRows}, "dataScales"); + Constant *dataOffsets = G_.getParent()->createConstant( + ElemKind::Int32ITy, {numRows}, "dataOffsets"); + + auto dataScalesH = dataScales->getHandle(); + auto dataOffsetsH = dataOffsets->getHandle(); + auto scalesBiasesH = scalesBiasesC->getHandle(); + for (size_t i = 0, e = numRows; i < e; i++) { + dataScalesH.at({i}) = scalesBiasesH.at({i, 0}); + // Caffe2 represents offsets (bias) using float, while Glow uses + // int32_t. + dataOffsetsH.at({i}) = static_cast(scalesBiasesH.at({i, 1})); + } + + // Now create the actual node. + if (isWeighted) { + node = G_.createRowwiseQuantizedSparseLengthsWeightedSum( + opName, dataC, dataScales, dataOffsets, weights, indices, lengths); + } else { + node = G_.createRowwiseQuantizedSparseLengthsSum( + opName, dataC, dataScales, dataOffsets, indices, lengths); + } } + RETURN_IF_ERR(addNodeAsOutput(op, node)); return llvm::Error::success(); } diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt new file mode 100644 index 0000000000..a5a43f6db9 --- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt @@ -0,0 +1,22 @@ +name: "fused_rowwise_quantized_sparse_lengths_sum_init_net_test" +op { + output: "data" + type: "Int8GivenTensorFill" + arg { + name: "shape" + ints: 3 + ints: 10 + } + arg { + name: "values" + s: "\324\377\116\263\032\273\200\200\200\103\254\377\216\364\332\274\200\200\200\103\311\377\004\235\067\274\200\200\200\103" + } + arg { + name: "Y_zero_point" + i: 0 + } + arg { + name: "Y_scale" + f: 0.0 + } +} diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt new file mode 100644 index 0000000000..b186e19a01 
--- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt @@ -0,0 +1,12 @@ +name: "fused_rowwise_quantized_sparse_lengths_sum_predict_net_test" +op { + input: "data" + input: "indices" + input: "lengths" + output: "result" + name: "" + type: "SparseLengthsSumFused8BitRowwise" +} +external_input: "indices" +external_input: "lengths" +external_output: "result" diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt new file mode 100644 index 0000000000..b3d4438a09 --- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt @@ -0,0 +1,41 @@ +name: "fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net_test" +op { + output: "data" + type: "Int8GivenTensorFill" + arg { + name: "shape" + ints: 3 + ints: 9 + } + arg { + name: "values" + s: "\377\001\000\200\274\200\200\200\103\000\001\000\200\273\200\200\176\302\377\121\120\320\275\200\200\200\103" + } + arg { + name: "Y_zero_point" + i: 0 + } + arg { + name: "Y_scale" + f: 0.0 + } +} +op { + output: "weights" + type: "GivenTensorFill" + arg { + name: "shape" + ints: 8 + } + arg { + name: "values" + floats: 3.0 + floats: 1.0 + floats: 0.0 + floats: 0.0 + floats: 0.0 + floats: 0.0 + floats: 2.0 + floats: -0.5 + } +} diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt new file mode 100644 index 0000000000..1e9e774962 --- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt @@ -0,0 +1,13 @@ +name: "fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net_test" +op { + input: "data" + input: "weights" + input: "indices" + input: "lengths" + output: "result" + 
name: "" + type: "SparseLengthsWeightedSumFused8BitRowwise" +} +external_input: "indices" +external_input: "lengths" +external_output: "result" diff --git a/tests/unittests/Caffe2ImporterTest.cpp b/tests/unittests/Caffe2ImporterTest.cpp index 809c5f740f..33022dcbd6 100644 --- a/tests/unittests/Caffe2ImporterTest.cpp +++ b/tests/unittests/Caffe2ImporterTest.cpp @@ -1897,3 +1897,196 @@ TEST(caffe2, SparseLengthsSum8BitsRowwise) { EXPECT_TRUE(expected.isEqual(result, 0.02f)); } + +/// Test loading SparseLengthsWeightedSumFused8BitRowwise. This is created as a +/// RowwiseQuantizedSparseLengthsWeightedSumNode. The following inputs/outputs +/// are used/expected for this test. Note that the DATA input is +/// rowwise-quantized in the init_net proto. +/// DATA = [[2.0, -0.5, 13]] +/// WEIGHTS = [3, 1, 0, 0, 0, 0, 2, -0.5] +/// INDICES = [1, 0, 2, 0, 1, 2, 2, 0] +/// LENGTHS = [3, 0, 3, 2] +/// OUTPUT = [[0.5, 0, 0, 25]] +TEST(caffe2, SparseLengthsWeightedSumFused8BitRowwise) { + ExecutionEngine EE{BackendKind::Interpreter}; + auto &mod = EE.getModule(); + Function *F = mod.createFunction("main"); + + std::string NetDescFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt"); + std::string NetWeightFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt"); + + Placeholder *output, *indices, *lengths; + Context ctx; + + TypeRef indicesType = F->getParent()->uniqueType(ElemKind::Int64ITy, {8}); + TypeRef lengthsType = F->getParent()->uniqueType(ElemKind::Int32ITy, {4}); + + // Destroy the loader after the graph is loaded since the following execution + // will not depend on anything from the loader.
+ { + Caffe2ModelLoader caffe2LD(NetDescFilename, NetWeightFilename, + {"indices", "lengths"}, + {indicesType, lengthsType}, *F); + + indices = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("indices"))); + lengths = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("lengths"))); + output = EXIT_ON_ERR(caffe2LD.getSingleOutput()); + } + + ASSERT_TRUE(indices); + ASSERT_TRUE(lengths); + + ctx.allocate(indices)->getHandle() = { + 1, 0, 2, 0, 1, 2, 2, 0, + }; + ctx.allocate(lengths)->getHandle() = { + 3, + 0, + 3, + 2, + }; + + // High level check on the content of the graph. We have 1 rowwise-quantized + // SLWS and 1 save. + EXPECT_EQ(F->getNodes().size(), 2); + SaveNode *saveNode = getSaveNodeFromDest(output); + FusedRowwiseQuantizedSparseLengthsWeightedSumNode *FRWQSLWS = + llvm::dyn_cast( + saveNode->getInput().getNode()); + ASSERT_TRUE(FRWQSLWS); + // Check that the weights input is a Constant node. + Constant *weights = + llvm::dyn_cast(FRWQSLWS->getWeights().getNode()); + ASSERT_TRUE(weights); + // Check that the data input is a Constant node with expected ElemKind. + Constant *data = llvm::dyn_cast(FRWQSLWS->getData().getNode()); + ASSERT_TRUE(data); + EXPECT_TRUE(data->getElementType() == ElemKind::Int8FusedQTy); + + // We have 3 placeholders: 1 for save, and then indices and lengths. + EXPECT_EQ(mod.getPlaceholders().size(), 3); + + // We have 2 constants: data and weights. + EXPECT_EQ(mod.getConstants().size(), 2); + + EE.compile(CompilationMode::Infer, F); + + EE.run(ctx); + + Tensor &result = *ctx.get(output); + Tensor expected(ElemKind::FloatTy, {4, 1}); + expected.getHandle() = { + 0.5, + 0, + 0, + 25, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02f)); +} + +/// Test loading SparseLengthsSumFused8BitRowwise. This is created as a +/// RowwiseQuantizedSparseLengthsWeightedSumNode. The following inputs/outputs +/// are used/expected for this test. Note that the DATA input is +/// rowwise-quantized in the init_net proto. 
+/// DATA = [ +/// [1.0, 1.2], +/// [2.3, 3.4], +/// [4.5, 5.7], +/// ] +/// INDICES = [2, 0, 1, 2, 0, 0, 0, 0] +/// LENGTHS = [2, 0, 2, 1, 3] +/// OUTPUT = [ +/// [5.5, 6.9], +/// [0.0, 0.0], +/// [6.8, 9.1], +/// [1.0, 1.2], +/// [3.0, 3.6], +/// ] +TEST(caffe2, SparseLengthsSumFused8BitRowwise) { + ExecutionEngine EE{BackendKind::Interpreter}; + auto &mod = EE.getModule(); + Function *F = mod.createFunction("main"); + + std::string NetDescFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt"); + std::string NetWeightFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt"); + + Placeholder *output, *indices, *lengths; + Context ctx; + + TypeRef indicesType = F->getParent()->uniqueType(ElemKind::Int64ITy, {8}); + TypeRef lengthsType = F->getParent()->uniqueType(ElemKind::Int32ITy, {5}); + + // Destroy the loader after the graph is loaded since the following execution + // will not depend on anything from the loader. + { + Caffe2ModelLoader caffe2LD(NetDescFilename, NetWeightFilename, + {"indices", "lengths"}, + {indicesType, lengthsType}, *F); + + indices = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("indices"))); + lengths = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("lengths"))); + output = EXIT_ON_ERR(caffe2LD.getSingleOutput()); + } + + ASSERT_TRUE(indices); + ASSERT_TRUE(lengths); + + ctx.allocate(indices)->getHandle() = { + 2, 0, 1, 2, 0, 0, 0, 0, + }; + ctx.allocate(lengths)->getHandle() = { + 2, 0, 2, 1, 3, + }; + + // High level check on the content of the graph. We have 1 rowwise-quantized + // SLWS (which implements SLS), 1 Splat for the weights, and 1 save.
+ EXPECT_EQ(F->getNodes().size(), 3); + SaveNode *saveNode = getSaveNodeFromDest(output); + FusedRowwiseQuantizedSparseLengthsWeightedSumNode *FRWQSLS = + llvm::dyn_cast( + saveNode->getInput().getNode()); + ASSERT_TRUE(FRWQSLS); + SplatNode *splatNode = + llvm::dyn_cast(FRWQSLS->getWeights().getNode()); + ASSERT_TRUE(splatNode); + EXPECT_EQ(splatNode->getValue(), 1.0f); + // Check that the data input is a Constant node with expected ElemKind. + Constant *data = llvm::dyn_cast(FRWQSLS->getData().getNode()); + ASSERT_TRUE(data); + EXPECT_TRUE(data->getElementType() == ElemKind::Int8FusedQTy); + + // We have 3 placeholders: 1 for save, and then indices and lengths. + EXPECT_EQ(mod.getPlaceholders().size(), 3); + + // We have 1 constant: data. + EXPECT_EQ(mod.getConstants().size(), 1); + + EE.compile(CompilationMode::Infer, F); + + EE.run(ctx); + + Tensor &result = *ctx.get(output); + Tensor expected(ElemKind::FloatTy, {5, 2}); + expected.getHandle() = { + 5.5f, 6.9f, 0.0f, 0.0f, 6.8f, 9.1f, 1.0f, 1.2f, 3.0f, 3.6f, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02f)); +} From a7e795f85bbfb77d83538ff20ccd24a076700f93 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 19:44:03 -0800 Subject: [PATCH 7/8] Add documentation for ElemKind::Int8FusedQTy --- docs/Quantization.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/Quantization.md b/docs/Quantization.md index e862b729eb..4d16f5c198 100644 --- a/docs/Quantization.md +++ b/docs/Quantization.md @@ -212,3 +212,16 @@ Row-wise quantized SparseLengthsWeightedSum is also supported. Similar to the above, we compute scales and offsets per row, to be used with the `Data` input for the `RowwiseQuantizedSparseLengthsSumNode`. Scales and Offsets are inputs to the node. Output of this node is float, matching the Caffe2 implementation. 
+ +### Fused Row-wise Quantization + +For some backends it may be beneficial to keep each row's scales and offsets +fused inline with the data. Caffe2 implements nodes with fused storage, such as +[SparseLengthsWeightedSum](https://caffe2.ai/docs/operators-catalogue.html#sparselengthsweightedsumfused8bitrowwise). Glow +supports such fused Nodes/Instructions, for example +`FusedRowwiseQuantizedSparseLengthsWeightedSum`. The `ElemKind` of fused tensors +is `Int8FusedQTy`. Tensors with `Int8FusedQTy` are 2-dimensional, and have an +extra 8 columns for each row. The first extra 4 bytes are the float scale of the +row, and the second extra 4 bytes are the int32_t offset. Note that similar to +normal row-wise quantized tensors, they use a dummy scale and offset in the +Type. From 3f0740fb2aee3aca58617badd53a955c76c26b2e Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Mon, 11 Feb 2019 17:18:09 -0800 Subject: [PATCH 8/8] [Tensor] Change init so it doesn't touch fused scale/offset.
--- include/glow/Base/Tensor.h | 36 +++++++++++++-- lib/Base/Tensor.cpp | 9 +++- tests/unittests/TensorsTest.cpp | 77 +++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 5 deletions(-) diff --git a/include/glow/Base/Tensor.h b/include/glow/Base/Tensor.h index 5d89793c74..5c1bd9b75a 100644 --- a/include/glow/Base/Tensor.h +++ b/include/glow/Base/Tensor.h @@ -107,6 +107,18 @@ class Tensor final { auto *data = reinterpret_cast(getData()); std::fill(&data[0], &data[0] + size(), (int32_t)type_.getOffset()); } break; + case ElemKind::Int8FusedQTy: { + assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); + assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns."); + const size_t width = dims()[1]; + auto *data = reinterpret_cast(getData()); + for (size_t i = 0, e = dims()[0]; i < e; i++) { + int8_t *scaleOffsetPtr = &data[(i + 1) * width] - 8; + int32_t offset; + memcpy(&offset, scaleOffsetPtr + 4, 4); + std::fill(&data[i * width], scaleOffsetPtr, (int8_t)offset); + } + } break; default: // Non-quantized tensors are set to 0. std::fill(&getData()[0], &getData()[0] + size() * type_.getElementSize(), @@ -174,8 +186,9 @@ class Tensor final { Tensor &operator=(const Tensor &other) = delete; /// Initialize the content of the tensor using the \p init method. The value - /// \p val is the initialization parameter. \p PRNG is used to generate - /// random numbers. + /// \p val is the initialization parameter. \p PRNG is used to generate random + /// numbers. Note that if the tensor's kind is Int8FusedQTy, then the fused + /// scaled/offsets will not be modified. 
void init(InitKind init, float val, PseudoRNG &PRNG); /// \returns unowned tensor using the same data buffer as the current tensor @@ -717,8 +730,23 @@ template class Handle final { assert(filterSize > 0 && "invalid filter size"); double scale = std::sqrt(3.0 / double(filterSize)); std::uniform_real_distribution<> dist(-scale, scale); - for (auto &e : *this) { - e = dist(PRNG); + switch (getElementType()) { + default: { + for (auto &e : *this) { + e = dist(PRNG); + } + return; + } + case ElemKind::Int8FusedQTy: { + assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); + assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns."); + for (size_t i = 0, e = dims()[0]; i < e; i++) { + for (size_t j = 0, f = dims()[1] - 8; j < f; j++) { + at({i, j}) = dist(PRNG); + } + } + return; + } } } diff --git a/lib/Base/Tensor.cpp b/lib/Base/Tensor.cpp index e441a648eb..11659c8574 100644 --- a/lib/Base/Tensor.cpp +++ b/lib/Base/Tensor.cpp @@ -423,7 +423,14 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { break; } case ElemKind::Int8FusedQTy: { - getHandle().clear(val); + assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); + assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns."); + auto H = getHandle(); + for (size_t i = 0; i < dims()[0]; i++) { + for (size_t j = 0, f = dims()[1] - 8; j < f; j++) { + H.at({i, j}) = val; + } + } break; } } diff --git a/tests/unittests/TensorsTest.cpp b/tests/unittests/TensorsTest.cpp index 6a6d6f2c3b..8803c1e753 100644 --- a/tests/unittests/TensorsTest.cpp +++ b/tests/unittests/TensorsTest.cpp @@ -791,3 +791,80 @@ TEST(Tensor, insertSlice) { 3.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f}; EXPECT_TRUE(big.isEqual(expected)); } + +/// Check that after initializing a fused tensor to zero that the scale and +/// offset are not changed and that the values for each row are set to that +/// row's offset. 
+TEST(Tensor, initZeroFused) { + Tensor T(ElemKind::Int8FusedQTy, {10, 10}, 0.0, 0); + auto TH = T.getHandle(); + TH.clear(127); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 2; j < 10; j++) { + // Set 6 due to endianess when loading the int32_t offset. + if (j == 6) { + TH.at({i, j}) = i + 100; + } else { + TH.at({i, j}) = 0; + } + } + } + PseudoRNG PRNG; + T.init(Tensor::InitKind::Zero, 1, PRNG); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + // Now check that both the offset and the values are correct, and that all + // other values are still 0. + if (j < 2 || j == 6) { + EXPECT_EQ(TH.at({i, j}), i + 100); + } else { + EXPECT_EQ(TH.at({i, j}), 0); + } + } + } +} + +/// Check that initializing a fused tensor with Xavier that the scale and offset +/// are not changed. +TEST(Tensor, initXavierFused) { + Tensor T(ElemKind::Int8FusedQTy, {10, 10}, 0.0, 0); + PseudoRNG PRNG; + auto TH = T.getHandle(); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + TH.at({i, j}) = i * 10 + j; + } + } + T.init(Tensor::InitKind::Xavier, 1, PRNG); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 2; j < 10; j++) { + // Check that the scales/offsets are unchanged. + EXPECT_EQ(TH.at({i, j}), i * 10 + j); + } + } +} + +/// Check that initializing a fused tensor with Broadcast that the scale and +/// offset are not changed, and broadcast value is set correctly. +TEST(Tensor, initBroadcastFused) { + Tensor T(ElemKind::Int8FusedQTy, {10, 10}, 0.0, 0); + auto TH = T.getHandle(); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + TH.at({i, j}) = i * 10 + j; + } + } + PseudoRNG PRNG; + T.init(Tensor::InitKind::Broadcast, 5, PRNG); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + // Check that the scales/offsets are unchanged, and that the broadcast + // value is everywhere else. 
+ if (j < 2) { + EXPECT_EQ(TH.at({i, j}), 5); + } else { + EXPECT_EQ(TH.at({i, j}), i * 10 + j); + } + } + } +}