From 8c6229fe21319e5b591405b3a9da14af9850e035 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 15:34:49 -0800 Subject: [PATCH 1/8] [Graph] Fix comments for RWQ-SLWS/SLS --- include/glow/Graph/Graph.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/include/glow/Graph/Graph.h b/include/glow/Graph/Graph.h index 4ae088972c..5413619495 100644 --- a/include/glow/Graph/Graph.h +++ b/include/glow/Graph/Graph.h @@ -573,18 +573,19 @@ class Function final : public Named { NodeValue data, NodeValue weights, NodeValue indices, NodeValue lengths); - /// Create a node, performing SparseLengthsSum operation, using rowwise - /// quantization for the input data. Gathers slices of the outer-most - /// dimension of Data indexed by Indices vector, and then accumulates them - /// into len(Lengths) entries: first Lengths[0] slices are aggregated to - /// Result[0], next Lengths[1] slices are aggregated to Result[1], - /// etc. I.e. sum(Lengths) must be equal to len(Indices). + /// Creates and \returns a node of \p name, performing the SparseLengthsSum + /// operation, using rowwise quantization for the input \p data with the \p + /// scales and \p offsets as separate input tensors. Gathers slices of the + /// outer-most dimension of data indexed by the \p indices vector, and then + /// accumulates them into len(\p lengths) entries: first Lengths[0] slices are + /// aggregated to Result[0], next Lengths[1] slices are aggregated to + /// Result[1], etc. I.e. sum(Lengths) must be equal to len(Indices). 
RowwiseQuantizedSparseLengthsWeightedSumNode * - createRowwiseQuantizedSparseLengthsWeightedSum( - llvm::StringRef name, Constant *data, Constant *scales, Constant *offsets, - NodeValue weights, NodeValue indices, NodeValue lengths); + createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Constant *data, + Constant *scales, Constant *offsets, + NodeValue indices, NodeValue lengths); - /// Same as \ref createRowwiseQuantizedSparseLengthsWeightedSum(), but expects + /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but expects /// float input \p data, which is rowwise-quantized internally. RowwiseQuantizedSparseLengthsWeightedSumNode * createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Tensor &data, @@ -593,11 +594,11 @@ class Function final : public Named { /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but i-th slice is /// multiplied by weights[i]. len(weights) must be equal to len(indices). RowwiseQuantizedSparseLengthsWeightedSumNode * - createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, Constant *data, - Constant *scales, Constant *offsets, - NodeValue indices, NodeValue lengths); + createRowwiseQuantizedSparseLengthsWeightedSum( + llvm::StringRef name, Constant *data, Constant *scales, Constant *offsets, + NodeValue weights, NodeValue indices, NodeValue lengths); - /// Same as \ref createRowwiseQuantizedSparseLengthsSum(), but expects + /// Same as \ref createRowwiseQuantizedSparseLengthsWeightedSum(), but expects /// float input \p data, which is rowwise-quantized internally. 
RowwiseQuantizedSparseLengthsWeightedSumNode * createRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name, From 914d10f7041fd2a7e6042fc96a59a8a40c430428 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 10:50:07 -0800 Subject: [PATCH 2/8] Add Int8FusedQTy --- include/glow/Base/Tensor.h | 16 ++++++++++++++++ include/glow/Base/Type.h | 24 +++++++++++++++--------- lib/Backends/CPU/LLVMIRGen.cpp | 4 ++++ lib/Base/Tensor.cpp | 15 +++++++++++++++ 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/include/glow/Base/Tensor.h b/include/glow/Base/Tensor.h index 8618c9bfae..5d89793c74 100644 --- a/include/glow/Base/Tensor.h +++ b/include/glow/Base/Tensor.h @@ -288,6 +288,17 @@ class Tensor final { return false; } + // For now, make sure that either both or neither of the tensors have + // Int8FusedQTy. While it is possible for an Int8QTy tensor to equal a + // Int8FusedQTy tensor if the Int8FusedQTy tensor has the same scale/offset + // on all of its rows, and that scale/offset match that of the Int8QTy, we + // do not support checking this for now. + assert(((getElementType() == ElemKind::Int8FusedQTy && + other.getElementType() == ElemKind::Int8FusedQTy) || + (getElementType() != ElemKind::Int8FusedQTy && + other.getElementType() != ElemKind::Int8FusedQTy)) && + "Int8FusedQTy only supports comparing against same ElemKind."); + switch (getElementType()) { case ElemKind::FloatTy: return isEqualImpl(other, allowedError); @@ -315,6 +326,11 @@ class Tensor final { return isEqualImpl(other, allowedError); case ElemKind::Int64ITy: return isEqualImpl(other, allowedError); + // Note: We can use isEqualImpl() here because the scales/offsets will be + // compared as if they were data, so we will return false if any rowwise + // scale/offset do not match. + case ElemKind::Int8FusedQTy: + return isEqualImpl(other, allowedError); } // This is to make compiler happy. 
It can never reach this point as switch diff --git a/include/glow/Base/Type.h b/include/glow/Base/Type.h index 9b6f2ab729..1b7622f885 100644 --- a/include/glow/Base/Type.h +++ b/include/glow/Base/Type.h @@ -185,13 +185,14 @@ inline bool operator==(const ShapeNCHW &LHS, const ShapeNCHW &RHS) { /// An enum representing the type used by the elements of a tensor. The types of /// Handles for these tensors should match the element kind. enum class ElemKind : unsigned char { - FloatTy, // 32-bit float type (float) - Float16Ty, // 16-bit float type (half, fp16) - Int8QTy, // 8-bit quantized type (int8_t) - Int16QTy, // 16-bit quantized type (int16_t) - Int32QTy, // 32-bit quantized type (int32_t) - Int32ITy, // 32-bit index type (int32_t) - Int64ITy, // 64-bit index type (int64_t) + FloatTy, // 32-bit float type (float) + Float16Ty, // 16-bit float type (half, fp16) + Int8QTy, // 8-bit quantized type (int8_t) + Int16QTy, // 16-bit quantized type (int16_t) + Int32QTy, // 32-bit quantized type (int32_t) + Int32ITy, // 32-bit index type (int32_t) + Int64ITy, // 64-bit index type (int64_t) + Int8FusedQTy, // 8-bit quantized type with fused scale/offset (int8_t) }; /// A class that represents a type of a tensor. 
@@ -360,6 +361,8 @@ struct Type final { return std::is_same::value; case ElemKind::Int64ITy: return std::is_same::value; + case ElemKind::Int8FusedQTy: + return std::is_same::value; } GLOW_UNREACHABLE("Invalid type."); } @@ -368,7 +371,8 @@ struct Type final { bool isQuantizedType() const { return elementType_ == ElemKind::Int8QTy || elementType_ == ElemKind::Int16QTy || - elementType_ == ElemKind::Int32QTy; + elementType_ == ElemKind::Int32QTy || + elementType_ == ElemKind::Int8FusedQTy; } /// \returns true if the type of this Tensor is one of the floating point @@ -401,6 +405,8 @@ struct Type final { return sizeof(int32_t); case ElemKind::Int64ITy: return sizeof(int64_t); + case ElemKind::Int8FusedQTy: + return sizeof(int8_t); } GLOW_UNREACHABLE("Invalid type."); } @@ -413,7 +419,7 @@ struct Type final { /// \return the textual name of the element \p Ty. static llvm::StringRef getElementName(ElemKind Ty) { static const char *names[] = { - "float", "float16", "i8", "i16", "i32", "index32", "index64", + "float", "float16", "i8", "i16", "i32", "index32", "index64", "i8fused", }; return names[(int)Ty]; } diff --git a/lib/Backends/CPU/LLVMIRGen.cpp b/lib/Backends/CPU/LLVMIRGen.cpp index dd1857a258..1816b381cf 100644 --- a/lib/Backends/CPU/LLVMIRGen.cpp +++ b/lib/Backends/CPU/LLVMIRGen.cpp @@ -237,6 +237,8 @@ llvm::Type *LLVMIRGen::getElementType(llvm::IRBuilder<> &builder, return builder.getInt32Ty(); case ElemKind::Int32ITy: return builder.getInt32Ty(); + case ElemKind::Int8FusedQTy: + return builder.getInt8Ty(); } return nullptr; } @@ -469,6 +471,8 @@ llvm::Value *LLVMIRGen::emitConst(llvm::IRBuilder<> &builder, float val, return builder.getInt32(static_cast(val)); case ElemKind::Int32ITy: return builder.getInt32(static_cast(val)); + case ElemKind::Int8FusedQTy: + return builder.getInt8(static_cast(val)); } llvm_unreachable("Unknown element type"); } diff --git a/lib/Base/Tensor.cpp b/lib/Base/Tensor.cpp index 6dc8c8e8cf..e441a648eb 100644 --- a/lib/Base/Tensor.cpp 
+++ b/lib/Base/Tensor.cpp @@ -283,6 +283,8 @@ void glow::dumpAsciiImpl(const Tensor *T, llvm::raw_ostream &os) { return dumpAsciiGenericImpl(T->getHandle(), os); case ElemKind::Int64ITy: return dumpAsciiGenericImpl(T->getHandle(), os); + case ElemKind::Int8FusedQTy: + return dumpAsciiGenericImpl(T->getHandle(), os); } } @@ -304,6 +306,8 @@ void glow::dumpImpl(const Tensor *T, llvm::raw_ostream &os) { return dumpGenericImpl(T->getHandle(), os); case ElemKind::Int64ITy: return dumpGenericImpl(T->getHandle(), os); + case ElemKind::Int8FusedQTy: + return dumpGenericImpl(T->getHandle(), os); } } @@ -368,6 +372,9 @@ void glow::genericTranspose(const Tensor *src, Tensor *dest, transposeSelectImpl(srcH, destH, shuffle); return; } + case ElemKind::Int8FusedQTy: { + llvm_unreachable("Transposing Int8FusedQTy is unsupported."); + } } } @@ -415,6 +422,10 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { getHandle().clear(val); break; } + case ElemKind::Int8FusedQTy: { + getHandle().clear(val); + break; + } } break; } @@ -449,6 +460,10 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { getHandle().initXavier(val, PRNG); break; } + case ElemKind::Int8FusedQTy: { + getHandle().initXavier(val, PRNG); + break; + } } break; } From 7c40989911ef53414b684c19d6b454e4a64001b6 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 10:50:07 -0800 Subject: [PATCH 3/8] [Quantization/Base] Add method for quantizing a fused tensor --- include/glow/Quantization/Base/Base.h | 9 +++++++ lib/Quantization/Base/Base.cpp | 35 +++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/glow/Quantization/Base/Base.h b/include/glow/Quantization/Base/Base.h index abbe2ef85d..e628fd031c 100644 --- a/include/glow/Quantization/Base/Base.h +++ b/include/glow/Quantization/Base/Base.h @@ -136,6 +136,15 @@ std::vector createMapping(TypeRef inTy, TypeRef outTy, void tensorRowwiseQuantization(const Tensor 
&input, Tensor &output, Tensor &scales, Tensor &offsets); +/// Fused-rowwise quantize the tensor \p input. Scales and offsets are generated +/// from each row of \p input. \p output is tensor of the same shape as input +/// but with 8 extra columns for storing fused scales (4 bytes (columns) for +/// float) and offset (4 bytes (columns) for int32_t). +/// \pre input.dims().size() == 2 +/// \pre output.dims().size() == 2 +/// \pre input.dims()[1] + 8 == output.dims()[1] +void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output); + } // namespace quantization } // namespace glow diff --git a/lib/Quantization/Base/Base.cpp b/lib/Quantization/Base/Base.cpp index a09b9f33be..bd94cf1ff6 100644 --- a/lib/Quantization/Base/Base.cpp +++ b/lib/Quantization/Base/Base.cpp @@ -393,5 +393,40 @@ void tensorRowwiseQuantization(const Tensor &input, Tensor &output, } } +void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output) { + // We are fusing the float scale and int32_t offset onto the end of each + // row. Thus input and output must both be 2 dimensional, with output having 8 + // extra columns for 4 bytes for float scale, and 4 bytes for int32_t offset. + assert(input.dims().size() == 2 && output.dims().size() == 2 && + "Input and output must be 2 dimensional."); + assert(input.dims()[1] + 8 == output.dims()[1] && + "Output must have 8 more columns than input."); + + const size_t outWidth = output.dims()[1]; + char *dataBasePtr = output.getUnsafePtr(); + + auto srcH = input.getHandle(); + auto destH = output.getHandle(); + for (size_t i = 0, e = input.dims()[0]; i < e; i++) { + auto slice = srcH.extractSlice(i); + auto rSrc = slice.getHandle(); + auto res = rSrc.minMaxArg(); + float min = rSrc.raw(res.first); + float max = rSrc.raw(res.second); + + // Set the dest's actual data based on the calculated scale/offset. 
+ TensorQuantizationParams qParams = + chooseQuantizationParams(min, max, quantization::Schema::Asymmetric); + for (size_t j = 0, f = input.dims()[1]; j < f; j++) { + destH.at({i, j}) = quantization::quantize(srcH.at({i, j}), qParams); + } + + // Now set the scale/offset at the end of each row. + char *currRowScaleOffsetPtr = dataBasePtr + (i + 1) * outWidth - 8; + memcpy(currRowScaleOffsetPtr, &qParams.scale, 4); + memcpy(currRowScaleOffsetPtr + 4, &qParams.offset, 4); + } +} + } // namespace quantization } // namespace glow From c00113fa1e32ec40a413c6f5f53d2bcadc7e610b Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 15:46:12 -0800 Subject: [PATCH 4/8] [New Operator] Add Interpreter support for FusedRowwiseQuantizedSparseLengthsWeightedSumNode and FusedRowwiseQuantizedSparseLengthsSumNode --- include/glow/Graph/Graph.h | 39 +++++++ lib/Backends/Interpreter/InterpreterNodes.cpp | 54 +++++++++ lib/Graph/Graph.cpp | 73 ++++++++++++ lib/Graph/Nodes.cpp | 32 ++++++ tests/unittests/OperatorTest.cpp | 108 ++++++++++++++++++ tools/ClassGen/InstrGen.cpp | 17 +++ tools/ClassGen/NodeGen.cpp | 19 +++ 7 files changed, 342 insertions(+) diff --git a/include/glow/Graph/Graph.h b/include/glow/Graph/Graph.h index 5413619495..5ba49262fa 100644 --- a/include/glow/Graph/Graph.h +++ b/include/glow/Graph/Graph.h @@ -607,6 +607,45 @@ class Function final : public Named { NodeValue indices, NodeValue lengths); + /// Creates and \returns a node of \p name, performing the SparseLengthsSum + /// operation, using fused rowwise quantization for the input \p data wherein + /// the scales and offsets are fused inline with each row of data. \p data + /// must be ElemKind::Int8FusedQTy. 
Gathers slices of the outer-most dimension + /// of data indexed by the \p indices vector, and then accumulates them into + /// len(\p lengths) entries: first Lengths[0] slices are aggregated to + /// Result[0], next Lengths[1] slices are aggregated to Result[1], + /// etc. I.e. sum(Lengths) must be equal to len(Indices). + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Constant *data, NodeValue indices, + NodeValue lengths); + + /// Same as \ref createFusedRowwiseQuantizedSparseLengthsSum(), but expects + /// float input \p data, which is rowwise-quantized and fused internally. + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Tensor &data, NodeValue indices, + NodeValue lengths); + + /// Same as \ref createFusedRowwiseQuantizedSparseLengthsWeightedSum(), but + /// expects float input \p data, which is rowwise-quantized and fused + /// internally. + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name, + Tensor &data, + NodeValue weights, + NodeValue indices, + NodeValue lengths); + + /// Same as \ref createFusedRowwiseQuantizedSparseLengthsSum(), but i-th slice + /// is multiplied by weights[i]. len(weights) must be equal to len(indices). + FusedRowwiseQuantizedSparseLengthsWeightedSumNode * + createFusedRowwiseQuantizedSparseLengthsWeightedSum(llvm::StringRef name, + Constant *data, + NodeValue weights, + NodeValue indices, + NodeValue lengths); + + /// Given a vector of segment lengths, calculates offsets of each segment and + /// packs them next to the lengths. For the input vector of length N the + /// output is a Nx2 matrix with (offset, lengths) packaged for each segment. 
diff --git a/lib/Backends/Interpreter/InterpreterNodes.cpp b/lib/Backends/Interpreter/InterpreterNodes.cpp index 88a8ab6aeb..acc4fe5602 100644 --- a/lib/Backends/Interpreter/InterpreterNodes.cpp +++ b/lib/Backends/Interpreter/InterpreterNodes.cpp @@ -2200,6 +2200,60 @@ void BoundInterpreterFunction::fwdRowwiseQuantizedSparseLengthsWeightedSumInst( } } +void BoundInterpreterFunction:: + fwdFusedRowwiseQuantizedSparseLengthsWeightedSumInst( + const FusedRowwiseQuantizedSparseLengthsWeightedSumInst *I) { + auto *out = getTensor(I->getDest()); + auto *data = getTensor(I->getData()); + auto *weights = getTensor(I->getWeights()); + auto *indices = getTensor(I->getIndices()); + auto *lengths = getTensor(I->getLengths()); + + out->zero(); + + auto IH = indices->getHandle(); + auto LH = lengths->getHandle(); + + size_t segments = lengths->dims()[0]; + size_t totalLength = 0; + for (size_t i = 0; i < segments; i++) { + totalLength += LH.raw(i); + } + assert(totalLength == indices->dims()[0] && + "sum(Lengths) must be equal to len(Indices)"); + + const size_t inLineSize = data->size() / data->dims()[0]; + const size_t outLineSize = out->size() / out->dims()[0]; + + auto DH = data->getHandle(); + auto WH = weights->getHandle(); + auto OH = out->getHandle(); + + size_t curIdx = 0; + for (size_t i = 0; i < segments; i++) { + for (size_t j = 0, e = LH.raw(i); j < e; j++) { + const float weight = WH.raw(curIdx); + const size_t rowIdx = IH.raw(curIdx++); + size_t offsetIn = rowIdx * inLineSize; + size_t offsetOut = i * outLineSize; + // Get the scale and offset from the row; go to the current row and offset + // into it up until the last 8 bytes. Use memcpy to get the values out to + // avoid alignment issues of accessing 4-byte values. 
+ const char *currRowScaleOffsetPtr = + data->getUnsafePtr() + offsetIn + inLineSize - 8; + float scale; + int32_t offset; + memcpy(&scale, currRowScaleOffsetPtr, sizeof(float)); + memcpy(&offset, currRowScaleOffsetPtr + 4, sizeof(int32_t)); + for (size_t k = 0; k < outLineSize; k++) { + float d = quantization::dequantize( + DH.raw(offsetIn++), TensorQuantizationParams{scale, offset}); + OH.raw(offsetOut++) += d * weight; + } + } + } +} + void BoundInterpreterFunction::fwdLengthsToRangesInst( const LengthsToRangesInst *I) { auto ranges = getTensor(I->getDest())->getHandle(); diff --git a/lib/Graph/Graph.cpp b/lib/Graph/Graph.cpp index 8f5b9984fe..5ff3ccda6b 100644 --- a/lib/Graph/Graph.cpp +++ b/lib/Graph/Graph.cpp @@ -1447,6 +1447,79 @@ Function::createRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, this, name, data, ones, indices, lengths); } +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsWeightedSum( + llvm::StringRef name, Constant *data, NodeValue weights, NodeValue indices, + NodeValue lengths) { + auto inDims = data->dims(); + ShapeVector outDims(inDims.begin(), inDims.end()); + outDims[0] = lengths.dims()[0]; + // The output column count is the same as the input column count, but without + // the extra 8 bytes for the fused scale/offset, as the output is not + // Int8FusedQTy. 
+ outDims[1] -= 8; + auto outTy = getParent()->uniqueType(ElemKind::FloatTy, outDims); + return addNode(new FusedRowwiseQuantizedSparseLengthsWeightedSumNode( + name, outTy, data, weights, indices, lengths)); +} + +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Constant *data, + NodeValue indices, + NodeValue lengths) { + auto ty = getParent()->uniqueType(ElemKind::FloatTy, {indices.dims()[0]}); + auto ones = createSplat(name.str() + ".ones", ty, 1.0); + return createFusedRowwiseQuantizedSparseLengthsWeightedSum(name, data, ones, + indices, lengths); +} + +/// Helper to create a FusedRowwiseQuantizedSparseLengthsWeightedSumNode in the +/// Function \p F with \p name, using \p data, \p weights, \p indices, and \p +/// lengths as inputs. The provided float data in \p Tensor is rowwise +/// quantized, creating a single Constant for the quantized data with the +/// scales and offsets fused inline per row, in the Module containing \p F. +static FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +quantizeDataAndCreateFusedRowwiseQuantizedSparseLengthsWeightedSum( + Function *F, llvm::StringRef name, Tensor &data, NodeValue weights, + NodeValue indices, NodeValue lengths) { + // For fused rowwise quantization, we must have a two-dimensional input. If + // passed in a single dimensional data Tensor then add an extra dimension. + const auto fDims = flattenCdr(data.dims()); + Tensor fData = data.getUnowned({fDims.first, fDims.second}); + + // Note: In rwqData, we are using a quantized type, however the scale/offset + // are set to dummy values 0.0/0. This is because the actually used + // scale/offset are fused inline with each row. Also, we expand the second + // dimension to include space for the scale/offset, each 4 bytes + // (float/int32_t). 
+ Constant *rwqData = F->getParent()->createConstant( + ElemKind::Int8FusedQTy, {fDims.first, fDims.second + 8}, 0.0, 0, "data"); + + quantization::tensorFusedRowwiseQuantization(fData, rwqData->getPayload()); + return F->createFusedRowwiseQuantizedSparseLengthsWeightedSum( + name, rwqData, weights, indices, lengths); +} + +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsWeightedSum( + llvm::StringRef name, Tensor &data, NodeValue weights, NodeValue indices, + NodeValue lengths) { + return quantizeDataAndCreateFusedRowwiseQuantizedSparseLengthsWeightedSum( + this, name, data, weights, indices, lengths); +} + +FusedRowwiseQuantizedSparseLengthsWeightedSumNode * +Function::createFusedRowwiseQuantizedSparseLengthsSum(llvm::StringRef name, + Tensor &data, + NodeValue indices, + NodeValue lengths) { + auto ty = getParent()->uniqueType(ElemKind::FloatTy, {indices.dims()[0]}); + auto ones = createSplat(name.str() + ".ones", ty, 1.0); + return quantizeDataAndCreateFusedRowwiseQuantizedSparseLengthsWeightedSum( + this, name, data, ones, indices, lengths); +} + LengthsToRangesNode *Function::createLengthsToRanges(llvm::StringRef name, NodeValue lengths) { ShapeVector outDims({lengths.dims()[0], 2}); diff --git a/lib/Graph/Nodes.cpp b/lib/Graph/Nodes.cpp index ecf723b379..dd020e90f2 100644 --- a/lib/Graph/Nodes.cpp +++ b/lib/Graph/Nodes.cpp @@ -747,6 +747,38 @@ bool RowwiseQuantizedSparseLengthsWeightedSumNode::verify() const { return isValid; } +bool FusedRowwiseQuantizedSparseLengthsWeightedSumNode::verify() const { + bool isValid = checkType(getResult(), ElemKind::FloatTy, this); + isValid &= checkType(getData(), ElemKind::Int8FusedQTy, this); + isValid &= checkType(getWeights(), ElemKind::FloatTy, this); + isValid &= checkType(getIndices(), ElemKind::Int64ITy, this); + isValid &= checkType(getLengths(), ElemKind::Int32ITy, this); + isValid &= expectCompareTrue("Indices must be a 1D vector", + 
getIndices().dims().size(), size_t(1), this); + isValid &= expectCompareTrue("Lengths must be a 1D vector", + getLengths().dims().size(), size_t(1), this); + isValid &= expectCompareTrue("Weights must be a 1D vector", + getWeights().dims().size(), size_t(1), this); + isValid &= + expectCompareTrue("Weights and Indices must have the same size", + getWeights().dims()[0], getIndices().dims()[0], this); + isValid &= expectCompareTrue("Data must be 2 dimensional.", + getData().dims().size(), size_t(2), this); + isValid &= expectCompareTrue("Data must have more than 8 columns.", + getData().dims()[1], size_t(8), this, + CompareOperatorGreaterEqual()); + isValid &= expectCompareTrue("Result must be 2 dimensional.", + getResult().dims().size(), size_t(2), this); + // Wrap this in isValid to prevent potential segfault if the result is + // incorrectly shaped. + if (isValid) { + isValid &= expectCompareTrue( + "Result output shape should have second dim as 8 less than Data.", + getResult().dims()[1] + 8, getData().dims()[1], this); + } + return isValid; +} + bool LengthsToRangesNode::verify() const { bool isValid = checkType(getResult(), getLengths().getElementType(), this); isValid &= checkType(getLengths(), ElemKind::Int32ITy, this); diff --git a/tests/unittests/OperatorTest.cpp b/tests/unittests/OperatorTest.cpp index 9453cd596d..f9fd1561d2 100644 --- a/tests/unittests/OperatorTest.cpp +++ b/tests/unittests/OperatorTest.cpp @@ -4145,6 +4145,114 @@ TEST_P(InterpAndCPU, RowwiseQuantizedSparseLengthsSum) { EXPECT_TRUE(expected.isEqual(result, 0.02)); } +TEST_P(InterpAndCPU, FusedRowwiseQuantizedSparseLengthsWeightedSum) { + /* + DATA = [[2.0, -0.5, 13]] + WEIGHTS = [3, 1, 0, 0, 0, 0, 2, -0.5] + INDICES = [1, 0, 2, 0, 1, 2, 2, 0] + LENGTHS = [3, 0, 3, 2] + OUTPUT = [[0.5, 0, 0, 25]] + */ + Tensor data(ElemKind::FloatTy, {3, 1}); + data.getHandle() = { + 2.0, + -0.5, + 13, + }; + + Constant *weights = mod_.createConstant(ElemKind::FloatTy, {8}, "weights"); + 
weights->getPayload().getHandle() = { + 3., 1., 0., 0., 0., 0., 2., -0.5, + }; + + Placeholder *indices = + mod_.createPlaceholder(ElemKind::Int64ITy, {8}, "indices", + /* isTrainable */ false); + Placeholder *lengths = + mod_.createPlaceholder(ElemKind::Int32ITy, {4}, "lengths", + /* isTrainable */ false); + + ctx_.allocate(indices)->getHandle() = { + 1, 0, 2, 0, 1, 2, 2, 0, + }; + ctx_.allocate(lengths)->getHandle() = { + 3, + 0, + 3, + 2, + }; + + auto *R = F_->createFusedRowwiseQuantizedSparseLengthsWeightedSum( + "RQSLWS", data, weights, indices, lengths); + SaveNode *S = F_->createSave("save", R); + ctx_.allocate(S->getPlaceholder()); + + EE_.compile(CompilationMode::Infer, F_); + EE_.run(ctx_); + + Tensor &result = *ctx_.get(S->getPlaceholder()); + Tensor expected(ElemKind::FloatTy, {4, 1}); + expected.getHandle() = { + 0.5, + 0, + 0, + 25, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02)); +} + +TEST_P(InterpAndCPU, FusedRowwiseQuantizedSparseLengthsSum) { + /* + DATA = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + INDICES = [2, 0, 1, 2, 0, 0, 0, 0] + LENGTHS = [2, 0, 2, 1, 3] + OUTPUT = [ + [5.5, 6.9], + [0.0, 0.0], + [6.8, 9.1], + [1.0, 1.2], + [3.0, 3.6], + ] + */ + Tensor data(ElemKind::FloatTy, {3, 2}); + data.getHandle() = { + 1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f, + }; + + Placeholder *indices = mod_.createPlaceholder( + ElemKind::Int64ITy, {8}, "indices", /* isTrainable */ false); + Placeholder *lengths = mod_.createPlaceholder( + ElemKind::Int32ITy, {5}, "lengths", /* isTrainable */ false); + + ctx_.allocate(indices)->getHandle() = { + 2, 0, 1, 2, 0, 0, 0, 0, + }; + ctx_.allocate(lengths)->getHandle() = { + 2, 0, 2, 1, 3, + }; + + auto *R = F_->createFusedRowwiseQuantizedSparseLengthsSum("RQSLWS", data, + indices, lengths); + SaveNode *S = F_->createSave("save", R); + ctx_.allocate(S->getPlaceholder()); + + EE_.compile(CompilationMode::Infer, F_); + EE_.run(ctx_); + + Tensor &result = *ctx_.get(S->getPlaceholder()); + Tensor 
expected(ElemKind::FloatTy, {5, 2}); + expected.getHandle() = { + 5.5f, 6.9f, 0.0f, 0.0f, 6.8f, 9.1f, 1.0f, 1.2f, 3.0f, 3.6f, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02)); +} + TEST_P(InterpAndCPU, SparseToDense) { // Create and initialize inputs. Make input 3D to make sure // multidimensional values are handled properly. diff --git a/tools/ClassGen/InstrGen.cpp b/tools/ClassGen/InstrGen.cpp index 4d62ed71f9..c915a6c392 100644 --- a/tools/ClassGen/InstrGen.cpp +++ b/tools/ClassGen/InstrGen.cpp @@ -250,6 +250,23 @@ int main(int argc, char **argv) { {"Lengths", "ElemKind::Int32ITy"}) .autoVerify(VerifyKind::SameShape, {"Weights", "Indices"}); + BB.newInstr("FusedRowwiseQuantizedSparseLengthsWeightedSum") + .addOperand("Dest", OperandKind::Out) + .addOperand("Data", OperandKind::In) + .addOperand("Weights", OperandKind::In) + .addOperand("Indices", OperandKind::In) + .addOperand("Lengths", OperandKind::In) + .autoIRGen() + .autoVerify(VerifyKind::SameElementType, {"Dest", "ElemKind::FloatTy"}) + .autoVerify(VerifyKind::SameElementType, + {"Data", "ElemKind::Int8FusedQTy"}) + .autoVerify(VerifyKind::SameElementType, {"Weights", "ElemKind::FloatTy"}) + .autoVerify(VerifyKind::SameElementType, + {"Indices", "ElemKind::Int64ITy"}) + .autoVerify(VerifyKind::SameElementType, + {"Lengths", "ElemKind::Int32ITy"}) + .autoVerify(VerifyKind::SameShape, {"Weights", "Indices"}); + BB.newInstr("LengthsToRanges") .addOperand("Dest", OperandKind::Out) .addOperand("Lengths", OperandKind::In) diff --git a/tools/ClassGen/NodeGen.cpp b/tools/ClassGen/NodeGen.cpp index 6b8f61ad5c..1cea7aeb21 100644 --- a/tools/ClassGen/NodeGen.cpp +++ b/tools/ClassGen/NodeGen.cpp @@ -355,6 +355,25 @@ int main(int argc, char **argv) { "data is rowwise-quantized, where the Scales and Offsets " "are 1D tensors of length equal to the first dim of Data."); + BB.newNode("FusedRowwiseQuantizedSparseLengthsWeightedSum") + .addInput("Data") + .addInput("Weights") + .addInput("Indices") + 
.addInput("Lengths") + .addResultFromCtorArg() + .setDocstring("Gathers slices of the outer-most dimension of Data " + "indexed by Indices vector, and then accumulates them into " + "len(Lengths) entries: first Lengths[0] slices are " + "aggregated to Result[0], next Lengths[1] slices are " + "aggregated to Result[1], etc. I.e. sum(Lengths) must be " + "equal to len(Indices). Before doing aggregation, each " + "individual slice is scaled by its weight: Result[0] = " + "Weights[0] * Slice(0) + Weights[1] * Slice(1) + ... " + "It implies that len(Weights) == len(Indices). The input " + "data is fused rowwise-quantized, where the Scales and " + "Offsets are appended to the end of each row. Thus, Data " + "must be a two-dimensional tensor."); + BB.newNode("LengthsToRanges") .addInput("Lengths") .addResultFromCtorArg() From 23182ed7cf75aebd539f53b364e12f8bf2341b9c Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 16:16:41 -0800 Subject: [PATCH 5/8] [New Operator] Add CPU support for FusedRowwiseQuantizedSparseLengthsWeightedSumNode and FusedRowwiseQuantizedSparseLengthsSumNode --- lib/Backends/CPU/LLVMIRGen.cpp | 26 ++++++++++++++++++++++++++ lib/Backends/CPU/libjit/libjit.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/lib/Backends/CPU/LLVMIRGen.cpp b/lib/Backends/CPU/LLVMIRGen.cpp index 1816b381cf..e826823887 100644 --- a/lib/Backends/CPU/LLVMIRGen.cpp +++ b/lib/Backends/CPU/LLVMIRGen.cpp @@ -326,6 +326,9 @@ llvm::Value *LLVMIRGen::emitValueAddress(llvm::IRBuilder<> &builder, case ElemKind::Int32ITy: T = llvm::Type::getInt32PtrTy(ctx_); break; + case ElemKind::Int8FusedQTy: + T = llvm::Type::getInt8PtrTy(ctx_); + break; default: llvm_unreachable("Unimplemented"); break; @@ -2322,6 +2325,29 @@ void LLVMIRGen::generateLLVMIRForInstr(llvm::IRBuilder<> &builder, break; } + case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsWeightedSumInstKind: { + auto *N = cast(I); + auto *dest = 
N->getDest(); + auto *data = N->getData(); + auto *weights = N->getWeights(); + auto *indices = N->getIndices(); + auto *lengths = N->getLengths(); + auto *destPtr = emitValueAddress(builder, dest); + auto *dataPtr = emitValueAddress(builder, data); + auto *weightsPtr = emitValueAddress(builder, weights); + auto *indicesPtr = emitValueAddress(builder, indices); + auto *lengthsPtr = emitValueAddress(builder, lengths); + auto *segments = emitConstSizeT(builder, lengths->dims()[0]); + auto *inLineSize = emitConstSizeT(builder, data->size() / data->dims()[0]); + auto *outLineSize = emitConstSizeT(builder, dest->size() / dest->dims()[0]); + auto *F = getFunction("fused_rowwise_quantized_sparse_lengths_weighted_sum", + dest->getElementType()); + createCall(builder, F, + {destPtr, dataPtr, weightsPtr, indicesPtr, lengthsPtr, segments, + inLineSize, outLineSize}); + break; + } + case Kinded::Kind::SparseToDenseInstKind: { auto *STDI = llvm::cast(I); auto *indices = STDI->getIndices(); diff --git a/lib/Backends/CPU/libjit/libjit.cpp b/lib/Backends/CPU/libjit/libjit.cpp index 84297cfeff..088ee26bb7 100644 --- a/lib/Backends/CPU/libjit/libjit.cpp +++ b/lib/Backends/CPU/libjit/libjit.cpp @@ -1057,6 +1057,30 @@ void libjit_rowwise_quantized_sparse_lengths_weighted_sum_f( } } +void libjit_fused_rowwise_quantized_sparse_lengths_weighted_sum_f( + float *dest, int8_t *data, float *weights, size_t *indices, + int32_t *lengths, size_t segments, size_t inLineSize, size_t outLineSize) { + memset(dest, 0, segments * outLineSize * sizeof(float)); + size_t curIndex = 0; + for (size_t i = 0; i < segments; i++) { + for (int32_t j = 0, e = lengths[i]; j < e; j++) { + const float weight = weights[curIndex]; + const size_t line = indices[curIndex]; + const int8_t *currRowScaleOffsetPtr = + data + ((line + 1) * inLineSize) - 8; + float scale; + int32_t offset; + memcpy(&scale, currRowScaleOffsetPtr, sizeof(float)); + memcpy(&offset, currRowScaleOffsetPtr + 4, sizeof(int32_t)); + for (size_t k = 
0; k < outLineSize; k++) { + const float fData = scale * (data[line * inLineSize + k] - offset); + dest[i * outLineSize + k] += weight * fData; + } + curIndex++; + } + } +} + void libjit_sparse_to_dense_f(float *dest, const size_t *indices, const float *values, size_t numIndices, size_t destSize, size_t valueSize) { From 013791ecae0b94119d90a23d0f42c3e1c3c549b1 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 16:24:18 -0800 Subject: [PATCH 6/8] [Caffe2ImporterTest] Add fused RWQ-SLWS/SLS tests --- lib/Importer/Caffe2ModelLoader.cpp | 133 ++++++++---- ...uantized_sparse_lengths_sum_init_net.pbtxt | 22 ++ ...tized_sparse_lengths_sum_predict_net.pbtxt | 12 ++ ...sparse_lengths_weighted_sum_init_net.pbtxt | 41 ++++ ...rse_lengths_weighted_sum_predict_net.pbtxt | 13 ++ tests/unittests/Caffe2ImporterTest.cpp | 193 ++++++++++++++++++ 6 files changed, 373 insertions(+), 41 deletions(-) create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt create mode 100644 tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt diff --git a/lib/Importer/Caffe2ModelLoader.cpp b/lib/Importer/Caffe2ModelLoader.cpp index 301b0f2ff2..c8a1d54424 100644 --- a/lib/Importer/Caffe2ModelLoader.cpp +++ b/lib/Importer/Caffe2ModelLoader.cpp @@ -874,13 +874,21 @@ llvm::Error Caffe2ModelLoader::loadOperator(const caffe2::OperatorDef &op) { } if (typeName == "SparseLengthsWeightedSum8BitsRowwise" || - typeName == "SparseLengthsSum8BitsRowwise") { - // If SparseLengthsWeightedSum8BitsRowwise, then the weights are the second - // input and so we need to shift indices/lengths/scalesBiases. 
+ typeName == "SparseLengthsSum8BitsRowwise" || + typeName == "SparseLengthsWeightedSumFused8BitRowwise" || + typeName == "SparseLengthsSumFused8BitRowwise") { + const bool isWeighted = + typeName == "SparseLengthsWeightedSum8BitsRowwise" || + typeName == "SparseLengthsWeightedSumFused8BitRowwise"; + const bool isFused = + typeName == "SparseLengthsWeightedSumFused8BitRowwise" || + typeName == "SparseLengthsSumFused8BitRowwise"; + // If weighted, then the weights are the second input and so we need to + // shift indices/lengths/scalesBiases. size_t indicesIdx = 1; size_t lengthsIdx = 2; size_t scalesBiasesIdx = 3; - if (typeName == "SparseLengthsWeightedSum8BitsRowwise") { + if (isWeighted) { indicesIdx++; lengthsIdx++; scalesBiasesIdx++; @@ -889,60 +897,103 @@ llvm::Error Caffe2ModelLoader::loadOperator(const caffe2::OperatorDef &op) { NodeValue data; ASSIGN_VALUE_OR_RETURN_ERR(data, getNodeValueOrCreateConstantByName(op.input(0))); + NodeValue weights; + if (isWeighted) { + ASSIGN_VALUE_OR_RETURN_ERR( + weights, getNodeValueOrCreateConstantByName(op.input(1))); + } NodeValue indices; ASSIGN_VALUE_OR_RETURN_ERR( indices, getNodeValueOrCreateConstantByName(op.input(indicesIdx))); NodeValue lengths; ASSIGN_VALUE_OR_RETURN_ERR( lengths, getNodeValueOrCreateConstantByName(op.input(lengthsIdx))); - NodeValue scalesBiases; - ASSIGN_VALUE_OR_RETURN_ERR(scalesBiases, getNodeValueOrCreateConstantByName( - op.input(scalesBiasesIdx))); - - Constant *scalesBiasesC = llvm::dyn_cast(scalesBiases); - RETURN_ERR_IF_NOT(scalesBiasesC, "scales_biases must be Constant."); Constant *dataC = llvm::dyn_cast(data); - RETURN_ERR_IF_NOT(dataC->getElementType() == ElemKind::Int8QTy, - "Data must be Int8QTy."); const size_t numRows = data.dims()[0]; // Make sure all the shapes make sense. 
RETURN_ERR_IF_NOT(lengths.dims().size() == 1, "lengths must be a vector."); RETURN_ERR_IF_NOT(indices.dims().size() == 1, "indices must be a vector."); - RETURN_ERR_IF_NOT(scalesBiases.dims().size() == 2, - "scale_bias has to be a matrix."); - RETURN_ERR_IF_NOT(scalesBiases.dims()[0] == numRows, - "scale_bias must have the same number of rows as data."); - RETURN_ERR_IF_NOT(scalesBiases.dims()[1] == 2, - "Second dim of scale_bias has to be equal to 2."); - - // Now strip out the scales and biases into their own tensors. - Constant *dataScales = G_.getParent()->createConstant( - ElemKind::FloatTy, {numRows}, "dataScales"); - Constant *dataOffsets = G_.getParent()->createConstant( - ElemKind::Int32ITy, {numRows}, "dataOffsets"); - - auto dataScalesH = dataScales->getHandle(); - auto dataOffsetsH = dataOffsets->getHandle(); - auto scalesBiasesH = scalesBiasesC->getHandle(); - for (size_t i = 0, e = numRows; i < e; i++) { - dataScalesH.at({i}) = scalesBiasesH.at({i, 0}); - // Caffe2 represents offsets (bias) using float, while Glow uses int32_t. - dataOffsetsH.at({i}) = static_cast(scalesBiasesH.at({i, 1})); - } Node *node; - if (typeName == "SparseLengthsWeightedSum8BitsRowwise") { - NodeValue weights; - ASSIGN_VALUE_OR_RETURN_ERR( - weights, getNodeValueOrCreateConstantByName(op.input(1))); - node = G_.createRowwiseQuantizedSparseLengthsWeightedSum( - opName, dataC, dataScales, dataOffsets, weights, indices, lengths); + if (isFused) { + // There is no specific fused quantized type in Caffe2, so we will load + // Int8QTy. We then change it from Int8QTy to Int8FusedQTy here if + // necessary -- another user could have already changed it. + if (dataC->getElementType() != ElemKind::Int8FusedQTy) { + RETURN_ERR_IF_NOT(dataC->getElementType() == ElemKind::Int8QTy, + "Data must be Int8QTy."); + // Use dummy 0.0/0 as scale/offset, since the actual scales/offsets are + // fused inline with the data. 
+ TypeRef fusedTy = G_.getParent()->uniqueType(ElemKind::Int8FusedQTy, + dataC->dims(), 0.0, 0); + dataC->setType(Storage::OutputIdx, fusedTy); + } + + // Caffe2 stores offsets as floats, whereas we want to use int32_t. + char *dataBasePtr = dataC->getPayload().getUnsafePtr(); + const size_t width = dataC->dims()[1]; + for (size_t i = 0, e = dataC->dims()[0]; i < e; ++i) { + // Must memcpy to the stack and back to avoid misaligned addresses. + char *currRowOffsetPtr = dataBasePtr + (i + 1) * width - 4; + float fOffset; + memcpy(&fOffset, currRowOffsetPtr, 4); + int32_t iOffset = static_cast(fOffset); + memcpy(currRowOffsetPtr, &iOffset, 4); + } + + // No other work to do, since the data is already loaded fused, so just + // create the new node with its inputs. + if (isWeighted) { + node = G_.createFusedRowwiseQuantizedSparseLengthsWeightedSum( + opName, dataC, weights, indices, lengths); + } else { + node = G_.createFusedRowwiseQuantizedSparseLengthsSum(opName, dataC, + indices, lengths); + } } else { - node = G_.createRowwiseQuantizedSparseLengthsSum( - opName, dataC, dataScales, dataOffsets, indices, lengths); + NodeValue scalesBiases; + ASSIGN_VALUE_OR_RETURN_ERR( + scalesBiases, + getNodeValueOrCreateConstantByName(op.input(scalesBiasesIdx))); + + Constant *scalesBiasesC = llvm::dyn_cast(scalesBiases); + RETURN_ERR_IF_NOT(scalesBiasesC, "scales_biases must be Constant."); + RETURN_ERR_IF_NOT(scalesBiases.dims().size() == 2, + "scale_bias has to be a matrix."); + RETURN_ERR_IF_NOT( + scalesBiases.dims()[0] == numRows, + "scale_bias must have the same number of rows as data."); + RETURN_ERR_IF_NOT(scalesBiases.dims()[1] == 2, + "Second dim of scale_bias has to be equal to 2."); + + // Now strip out the scales and biases into their own tensors. 
+ Constant *dataScales = G_.getParent()->createConstant( + ElemKind::FloatTy, {numRows}, "dataScales"); + Constant *dataOffsets = G_.getParent()->createConstant( + ElemKind::Int32ITy, {numRows}, "dataOffsets"); + + auto dataScalesH = dataScales->getHandle(); + auto dataOffsetsH = dataOffsets->getHandle(); + auto scalesBiasesH = scalesBiasesC->getHandle(); + for (size_t i = 0, e = numRows; i < e; i++) { + dataScalesH.at({i}) = scalesBiasesH.at({i, 0}); + // Caffe2 represents offsets (bias) using float, while Glow uses + // int32_t. + dataOffsetsH.at({i}) = static_cast(scalesBiasesH.at({i, 1})); + } + + // Now create the actual node. + if (isWeighted) { + node = G_.createRowwiseQuantizedSparseLengthsWeightedSum( + opName, dataC, dataScales, dataOffsets, weights, indices, lengths); + } else { + node = G_.createRowwiseQuantizedSparseLengthsSum( + opName, dataC, dataScales, dataOffsets, indices, lengths); + } } + RETURN_IF_ERR(addNodeAsOutput(op, node)); return llvm::Error::success(); } diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt new file mode 100644 index 0000000000..a5a43f6db9 --- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt @@ -0,0 +1,22 @@ +name: "fused_rowwise_quantized_sparse_lengths_sum_init_net_test" +op { + output: "data" + type: "Int8GivenTensorFill" + arg { + name: "shape" + ints: 3 + ints: 10 + } + arg { + name: "values" + s: "\324\377\116\263\032\273\200\200\200\103\254\377\216\364\332\274\200\200\200\103\311\377\004\235\067\274\200\200\200\103" + } + arg { + name: "Y_zero_point" + i: 0 + } + arg { + name: "Y_scale" + f: 0.0 + } +} diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt new file mode 100644 index 0000000000..b186e19a01 
--- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt @@ -0,0 +1,12 @@ +name: "fused_rowwise_quantized_sparse_lengths_sum_predict_net_test" +op { + input: "data" + input: "indices" + input: "lengths" + output: "result" + name: "" + type: "SparseLengthsSumFused8BitRowwise" +} +external_input: "indices" +external_input: "lengths" +external_output: "result" diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt new file mode 100644 index 0000000000..b3d4438a09 --- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt @@ -0,0 +1,41 @@ +name: "fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net_test" +op { + output: "data" + type: "Int8GivenTensorFill" + arg { + name: "shape" + ints: 3 + ints: 9 + } + arg { + name: "values" + s: "\377\001\000\200\274\200\200\200\103\000\001\000\200\273\200\200\176\302\377\121\120\320\275\200\200\200\103" + } + arg { + name: "Y_zero_point" + i: 0 + } + arg { + name: "Y_scale" + f: 0.0 + } +} +op { + output: "weights" + type: "GivenTensorFill" + arg { + name: "shape" + ints: 8 + } + arg { + name: "values" + floats: 3.0 + floats: 1.0 + floats: 0.0 + floats: 0.0 + floats: 0.0 + floats: 0.0 + floats: 2.0 + floats: -0.5 + } +} diff --git a/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt new file mode 100644 index 0000000000..1e9e774962 --- /dev/null +++ b/tests/models/caffe2Models/fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt @@ -0,0 +1,13 @@ +name: "fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net_test" +op { + input: "data" + input: "weights" + input: "indices" + input: "lengths" + output: "result" + 
name: "" + type: "SparseLengthsWeightedSumFused8BitRowwise" +} +external_input: "indices" +external_input: "lengths" +external_output: "result" diff --git a/tests/unittests/Caffe2ImporterTest.cpp b/tests/unittests/Caffe2ImporterTest.cpp index 809c5f740f..33022dcbd6 100644 --- a/tests/unittests/Caffe2ImporterTest.cpp +++ b/tests/unittests/Caffe2ImporterTest.cpp @@ -1897,3 +1897,196 @@ TEST(caffe2, SparseLengthsSum8BitsRowwise) { EXPECT_TRUE(expected.isEqual(result, 0.02f)); } + +/// Test loading SparseLengthsWeightedSumFused8BitRowwise. This is created as a +/// RowwiseQuantizedSparseLengthsWeightedSumNode. The following inputs/outputs +/// are used/expected for this test. Note that the DATA input is +/// rowwise-quantized in the init_net proto. +/// DATA = [[2.0, -0.5, 13]] +/// WEIGHTS = [3, 1, 0, 0, 0, 0, 2, -0.5] +/// INDICES = [1, 0, 2, 0, 1, 2, 2, 0] +/// LENGTHS = [3, 0, 3, 2] +/// OUTPUT = [[0.5, 0, 0, 25]] +TEST(caffe2, SparseLengthsWeightedSumFused8BitRowwise) { + ExecutionEngine EE{BackendKind::Interpreter}; + auto &mod = EE.getModule(); + Function *F = mod.createFunction("main"); + + std::string NetDescFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_weighted_sum_predict_net.pbtxt"); + std::string NetWeightFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_weighted_sum_init_net.pbtxt"); + + Placeholder *output, *indices, *lengths; + Context ctx; + + TypeRef indicesType = F->getParent()->uniqueType(ElemKind::Int64ITy, {8}); + TypeRef lengthsType = F->getParent()->uniqueType(ElemKind::Int32ITy, {4}); + + // Destroy the loader after the graph is loaded since the following execution + // will not depend on anything from the loader.
+ { + Caffe2ModelLoader caffe2LD(NetDescFilename, NetWeightFilename, + {"indices", "lengths"}, + {indicesType, lengthsType}, *F); + + indices = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("indices"))); + lengths = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("lengths"))); + output = EXIT_ON_ERR(caffe2LD.getSingleOutput()); + } + + ASSERT_TRUE(indices); + ASSERT_TRUE(lengths); + + ctx.allocate(indices)->getHandle() = { + 1, 0, 2, 0, 1, 2, 2, 0, + }; + ctx.allocate(lengths)->getHandle() = { + 3, + 0, + 3, + 2, + }; + + // High level check on the content of the graph. We have 1 rowwise-quantized + // SLWS and 1 save. + EXPECT_EQ(F->getNodes().size(), 2); + SaveNode *saveNode = getSaveNodeFromDest(output); + FusedRowwiseQuantizedSparseLengthsWeightedSumNode *FRWQSLWS = + llvm::dyn_cast( + saveNode->getInput().getNode()); + ASSERT_TRUE(FRWQSLWS); + // Check that the weights input is a Constant node. + Constant *weights = + llvm::dyn_cast(FRWQSLWS->getWeights().getNode()); + ASSERT_TRUE(weights); + // Check that the data input is a Constant node with expected ElemKind. + Constant *data = llvm::dyn_cast(FRWQSLWS->getData().getNode()); + ASSERT_TRUE(data); + EXPECT_TRUE(data->getElementType() == ElemKind::Int8FusedQTy); + + // We have 3 placeholders: 1 for save, and then indices and lengths. + EXPECT_EQ(mod.getPlaceholders().size(), 3); + + // We have 2 constants: data and weights. + EXPECT_EQ(mod.getConstants().size(), 2); + + EE.compile(CompilationMode::Infer, F); + + EE.run(ctx); + + Tensor &result = *ctx.get(output); + Tensor expected(ElemKind::FloatTy, {4, 1}); + expected.getHandle() = { + 0.5, + 0, + 0, + 25, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02f)); +} + +/// Test loading SparseLengthsSumFused8BitRowwise. This is created as a +/// RowwiseQuantizedSparseLengthsWeightedSumNode. The following inputs/outputs +/// are used/expected for this test. Note that the DATA input is +/// rowwise-quantized in the init_net proto. 
+/// DATA = [ +/// [1.0, 1.2], +/// [2.3, 3.4], +/// [4.5, 5.7], +/// ] +/// INDICES = [2, 0, 1, 2, 0, 0, 0, 0] +/// LENGTHS = [2, 0, 2, 1, 3] +/// OUTPUT = [ +/// [5.5, 6.9], +/// [0.0, 0.0], +/// [6.8, 9.1], +/// [1.0, 1.2], +/// [3.0, 3.6], +/// ] +TEST(caffe2, SparseLengthsSumFused8BitRowwise) { + ExecutionEngine EE{BackendKind::Interpreter}; + auto &mod = EE.getModule(); + Function *F = mod.createFunction("main"); + + std::string NetDescFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_sum_predict_net.pbtxt"); + std::string NetWeightFilename( + GLOW_DATA_PATH + "tests/models/caffe2Models/" + "fused_rowwise_quantized_sparse_lengths_sum_init_net.pbtxt"); + + Placeholder *output, *indices, *lengths; + Context ctx; + + TypeRef indicesType = F->getParent()->uniqueType(ElemKind::Int64ITy, {8}); + TypeRef lengthsType = F->getParent()->uniqueType(ElemKind::Int32ITy, {5}); + + // Destroy the loader after the graph is loaded since the following execution + // will not depend on anything from the loader. + { + Caffe2ModelLoader caffe2LD(NetDescFilename, NetWeightFilename, + {"indices", "lengths"}, + {indicesType, lengthsType}, *F); + + indices = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("indices"))); + lengths = llvm::dyn_cast( + EXIT_ON_ERR(caffe2LD.getNodeValueByName("lengths"))); + output = EXIT_ON_ERR(caffe2LD.getSingleOutput()); + } + + ASSERT_TRUE(indices); + ASSERT_TRUE(lengths); + + ctx.allocate(indices)->getHandle() = { + 2, 0, 1, 2, 0, 0, 0, 0, + }; + ctx.allocate(lengths)->getHandle() = { + 2, 0, 2, 1, 3, + }; + + // High level check on the content of the graph. We have 1 rowwise-quantized + // SLWS (which implements SLS), 1 Splat for the weights, and 1 save.
+ EXPECT_EQ(F->getNodes().size(), 3); + SaveNode *saveNode = getSaveNodeFromDest(output); + FusedRowwiseQuantizedSparseLengthsWeightedSumNode *FRWQSLS = + llvm::dyn_cast( + saveNode->getInput().getNode()); + ASSERT_TRUE(FRWQSLS); + SplatNode *splatNode = + llvm::dyn_cast(FRWQSLS->getWeights().getNode()); + ASSERT_TRUE(splatNode); + EXPECT_EQ(splatNode->getValue(), 1.0f); + // Check that the data input is a Constant node with expected ElemKind. + Constant *data = llvm::dyn_cast(FRWQSLS->getData().getNode()); + ASSERT_TRUE(data); + EXPECT_TRUE(data->getElementType() == ElemKind::Int8FusedQTy); + + // We have 3 placeholders: 1 for save, and then indices and lengths. + EXPECT_EQ(mod.getPlaceholders().size(), 3); + + // We have 1 constant: data. + EXPECT_EQ(mod.getConstants().size(), 1); + + EE.compile(CompilationMode::Infer, F); + + EE.run(ctx); + + Tensor &result = *ctx.get(output); + Tensor expected(ElemKind::FloatTy, {5, 2}); + expected.getHandle() = { + 5.5f, 6.9f, 0.0f, 0.0f, 6.8f, 9.1f, 1.0f, 1.2f, 3.0f, 3.6f, + }; + + EXPECT_TRUE(expected.isEqual(result, 0.02f)); +} From a7e795f85bbfb77d83538ff20ccd24a076700f93 Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Fri, 8 Feb 2019 19:44:03 -0800 Subject: [PATCH 7/8] Add documentation for ElemKind::Int8FusedQTy --- docs/Quantization.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/Quantization.md b/docs/Quantization.md index e862b729eb..4d16f5c198 100644 --- a/docs/Quantization.md +++ b/docs/Quantization.md @@ -212,3 +212,16 @@ Row-wise quantized SparseLengthsWeightedSum is also supported. Similar to the above, we compute scales and offsets per row, to be used with the `Data` input for the `RowwiseQuantizedSparseLengthsSumNode`. Scales and Offsets are inputs to the node. Output of this node is float, matching the Caffe2 implementation. 
+ +### Fused Row-wise Quantization + +For some backends it may be beneficial to keep each row's scales and offsets +fused inline with the data. Caffe2 implements nodes with fused storage, such as +[SparseLengthsWeightedSum](https://caffe2.ai/docs/operators-catalogue.html#sparselengthsweightedsumfused8bitrowwise). Glow +supports such fused Nodes/Instructions, for example +`FusedRowwiseQuantizedSparseLengthsWeightedSum`. The `ElemKind` of fused tensors +is `Int8FusedQTy`. Tensors with `Int8FusedQTy` are 2-dimensional, and have an +extra 8 columns for each row. The first extra 4 bytes are the float scale of the +row, and the second extra 4 bytes are the int32_t offset. Note that similar to +normal row-wise quantized tensors, they use a dummy scale and offset in the +Type. From 3f0740fb2aee3aca58617badd53a955c76c26b2e Mon Sep 17 00:00:00 2001 From: Jordan Fix <1198212+jfix71@users.noreply.github.com> Date: Mon, 11 Feb 2019 17:18:09 -0800 Subject: [PATCH 8/8] [Tensor] Change init so it doesn't touch fused scale/offset.
--- include/glow/Base/Tensor.h | 36 +++++++++++++-- lib/Base/Tensor.cpp | 9 +++- tests/unittests/TensorsTest.cpp | 77 +++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 5 deletions(-) diff --git a/include/glow/Base/Tensor.h b/include/glow/Base/Tensor.h index 5d89793c74..5c1bd9b75a 100644 --- a/include/glow/Base/Tensor.h +++ b/include/glow/Base/Tensor.h @@ -107,6 +107,18 @@ class Tensor final { auto *data = reinterpret_cast(getData()); std::fill(&data[0], &data[0] + size(), (int32_t)type_.getOffset()); } break; + case ElemKind::Int8FusedQTy: { + assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); + assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns."); + const size_t width = dims()[1]; + auto *data = reinterpret_cast(getData()); + for (size_t i = 0, e = dims()[0]; i < e; i++) { + int8_t *scaleOffsetPtr = &data[(i + 1) * width] - 8; + int32_t offset; + memcpy(&offset, scaleOffsetPtr + 4, 4); + std::fill(&data[i * width], scaleOffsetPtr, (int8_t)offset); + } + } break; default: // Non-quantized tensors are set to 0. std::fill(&getData()[0], &getData()[0] + size() * type_.getElementSize(), @@ -174,8 +186,9 @@ class Tensor final { Tensor &operator=(const Tensor &other) = delete; /// Initialize the content of the tensor using the \p init method. The value - /// \p val is the initialization parameter. \p PRNG is used to generate - /// random numbers. + /// \p val is the initialization parameter. \p PRNG is used to generate random + /// numbers. Note that if the tensor's kind is Int8FusedQTy, then the fused + /// scaled/offsets will not be modified. 
void init(InitKind init, float val, PseudoRNG &PRNG); /// \returns unowned tensor using the same data buffer as the current tensor @@ -717,8 +730,23 @@ template class Handle final { assert(filterSize > 0 && "invalid filter size"); double scale = std::sqrt(3.0 / double(filterSize)); std::uniform_real_distribution<> dist(-scale, scale); - for (auto &e : *this) { - e = dist(PRNG); + switch (getElementType()) { + default: { + for (auto &e : *this) { + e = dist(PRNG); + } + return; + } + case ElemKind::Int8FusedQTy: { + assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); + assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns."); + for (size_t i = 0, e = dims()[0]; i < e; i++) { + for (size_t j = 0, f = dims()[1] - 8; j < f; j++) { + at({i, j}) = dist(PRNG); + } + } + return; + } } } diff --git a/lib/Base/Tensor.cpp b/lib/Base/Tensor.cpp index e441a648eb..11659c8574 100644 --- a/lib/Base/Tensor.cpp +++ b/lib/Base/Tensor.cpp @@ -423,7 +423,14 @@ void Tensor::init(InitKind init, float val, PseudoRNG &PRNG) { break; } case ElemKind::Int8FusedQTy: { - getHandle().clear(val); + assert(dims().size() == 2 && "Fused tensor must be 2-dimensional."); + assert(dims()[1] > 8 && "Fused tensor must have more than 8 columns."); + auto H = getHandle(); + for (size_t i = 0; i < dims()[0]; i++) { + for (size_t j = 0, f = dims()[1] - 8; j < f; j++) { + H.at({i, j}) = val; + } + } break; } } diff --git a/tests/unittests/TensorsTest.cpp b/tests/unittests/TensorsTest.cpp index 6a6d6f2c3b..8803c1e753 100644 --- a/tests/unittests/TensorsTest.cpp +++ b/tests/unittests/TensorsTest.cpp @@ -791,3 +791,80 @@ TEST(Tensor, insertSlice) { 3.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f}; EXPECT_TRUE(big.isEqual(expected)); } + +/// Check that after initializing a fused tensor to zero that the scale and +/// offset are not changed and that the values for each row are set to that +/// row's offset. 
+TEST(Tensor, initZeroFused) { + Tensor T(ElemKind::Int8FusedQTy, {10, 10}, 0.0, 0); + auto TH = T.getHandle(); + TH.clear(127); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 2; j < 10; j++) { + // Set 6 due to endianess when loading the int32_t offset. + if (j == 6) { + TH.at({i, j}) = i + 100; + } else { + TH.at({i, j}) = 0; + } + } + } + PseudoRNG PRNG; + T.init(Tensor::InitKind::Zero, 1, PRNG); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + // Now check that both the offset and the values are correct, and that all + // other values are still 0. + if (j < 2 || j == 6) { + EXPECT_EQ(TH.at({i, j}), i + 100); + } else { + EXPECT_EQ(TH.at({i, j}), 0); + } + } + } +} + +/// Check that initializing a fused tensor with Xavier that the scale and offset +/// are not changed. +TEST(Tensor, initXavierFused) { + Tensor T(ElemKind::Int8FusedQTy, {10, 10}, 0.0, 0); + PseudoRNG PRNG; + auto TH = T.getHandle(); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + TH.at({i, j}) = i * 10 + j; + } + } + T.init(Tensor::InitKind::Xavier, 1, PRNG); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 2; j < 10; j++) { + // Check that the scales/offsets are unchanged. + EXPECT_EQ(TH.at({i, j}), i * 10 + j); + } + } +} + +/// Check that initializing a fused tensor with Broadcast that the scale and +/// offset are not changed, and broadcast value is set correctly. +TEST(Tensor, initBroadcastFused) { + Tensor T(ElemKind::Int8FusedQTy, {10, 10}, 0.0, 0); + auto TH = T.getHandle(); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + TH.at({i, j}) = i * 10 + j; + } + } + PseudoRNG PRNG; + T.init(Tensor::InitKind::Broadcast, 5, PRNG); + for (size_t i = 0; i < 10; i++) { + for (size_t j = 0; j < 10; j++) { + // Check that the scales/offsets are unchanged, and that the broadcast + // value is everywhere else. 
+ if (j < 2) { + EXPECT_EQ(TH.at({i, j}), 5); + } else { + EXPECT_EQ(TH.at({i, j}), i * 10 + j); + } + } + } +}