
Commit 0f58d20

jianyuh authored and facebook-github-bot committed
Add quantized::fbgemm_linear_unpack operator for serialization (#97)
Summary:
Pull Request resolved: pytorch/FBGEMM#97
Pull Request resolved: pytorch#20721

- FBGEMM: Add an unpack function to the PackBMatrix class that unpacks the pmat buffer into origin_buf (used by serialization to recover the weight matrix).
- PyTorch Quantizer: Add the quantized::fbgemm_linear_unpack operator for serialization.

Reviewed By: zafartahirov

Differential Revision: D15314568

fbshipit-source-id: 12080c8887ce31dc849d23e132ae1766ac319407
1 parent 4b576e5 commit 0f58d20
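To illustrate what the new operator is for, here is a minimal Python sketch of the pack/unpack round trip, mirroring the test added in test_quantized.py below. The weight shape and quantization parameters are illustrative, and an FBGEMM-enabled CPU build is assumed:

import numpy as np
import torch

# Quantize a 2-D float weight (shape and qparams chosen arbitrarily).
W = torch.randn(4, 8)
W_q = torch.quantize_linear(W, scale=0.1, zero_point=0, dtype=torch.qint8)

# Pack the weight for FBGEMM, then unpack it to recover the original
# quantized weight matrix (the path serialization relies on).
W_prepack = torch.ops.quantized.fbgemm_linear_prepack(W_q)
W_q_origin = torch.ops.quantized.fbgemm_linear_unpack(W_prepack)

# The unpacked weight matches the original integer values and qparams.
np.testing.assert_equal(W_q.int_repr().numpy(), W_q_origin.int_repr().numpy())
assert W_q.q_scale() == W_q_origin.q_scale()
assert W_q.q_zero_point() == W_q_origin.q_zero_point()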

File tree

5 files changed: +104 -12 lines changed


aten/src/ATen/native/quantized/cpu/fbgemm_utils.h

Lines changed: 16 additions & 9 deletions
@@ -13,7 +13,7 @@
 // of the A rows. The column offsets are needed for the asymmetric quantization
 // (affine quantization) of input matrix.
 // Note that in JIT mode we can think of a way to fuse col_offsets with bias.
-struct FBGEMM_API PackedFCWeight {
+struct FBGEMM_API PackedLinearWeight {
   std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w;
   std::vector<int32_t> col_offsets;
   float w_scale;
@@ -28,17 +28,24 @@ struct FBGEMM_API PackedConvWeight {
   int32_t w_zp;
 };
 
-// Convert the weight from uint8 to int8.
+// PackWeight: Convert the weight from uint8 to int8.
 static void convert_uint8_int8(
-    int K,
-    int N,
+    int len,
     const uint8_t* src_uint8,
     int8_t* dst_int8) {
-  for (size_t i = 0; i < N; ++i) {
-    for (size_t j = 0; j < K; ++j) {
-      dst_int8[i * K + j] =
-          static_cast<int8_t>(static_cast<int32_t>(src_uint8[i * K + j]) - 128);
-    }
+  for (int i = 0; i < len; ++i) {
+    dst_int8[i] = static_cast<int8_t>(static_cast<int32_t>(src_uint8[i]) - 128);
+  }
+}
+
+// UnpackWeight: Convert the weight from int8 to uint8.
+static void convert_int8_uint8(
+    int len,
+    const int8_t* src_int8,
+    uint8_t* dst_uint8) {
+  for (int i = 0; i < len; ++i) {
+    dst_uint8[i] =
+        static_cast<uint8_t>(static_cast<int32_t>(src_int8[i]) + 128);
   }
 }
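The pack/unpack helpers above are a plain offset by 128 in opposite directions, so convert_int8_uint8 exactly inverts convert_uint8_int8. An illustrative numpy sketch of the same transformation (not code from this commit):

import numpy as np

src_uint8 = np.array([0, 127, 128, 255], dtype=np.uint8)

# convert_uint8_int8 direction: subtract 128 (uint8 -> int8).
as_int8 = (src_uint8.astype(np.int32) - 128).astype(np.int8)   # [-128, -1, 0, 127]

# convert_int8_uint8 direction: add 128 (int8 -> uint8).
back = (as_int8.astype(np.int32) + 128).astype(np.uint8)

assert np.array_equal(back, src_uint8)  # lossless round trip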

aten/src/ATen/native/quantized/cpu/qlinear.cpp

Lines changed: 2 additions & 1 deletion
@@ -43,7 +43,8 @@ class QLinearInt8 final : public c10::OperatorKernel {
     }
 
     // Pull out the PackBMatrix and col_offsets instance from the owning tensor.
-    auto& pack_ptr = cpp_custom_type_hack::cast<PackedFCWeight>(packed_weight);
+    auto& pack_ptr =
+        cpp_custom_type_hack::cast<PackedLinearWeight>(packed_weight);
     auto packB = pack_ptr.w.get();
     // packB->printPackedMatrix("packedB inside fbgemm_linear (QLinearInt8): ");
     auto& col_offsets = pack_ptr.col_offsets;

aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp

Lines changed: 6 additions & 2 deletions
@@ -11,7 +11,7 @@
 namespace caffe2 {
 #ifdef USE_FBGEMM
 // Required for cpp_custom_type_hack to work
-CAFFE_KNOWN_TYPE(PackedFCWeight);
+CAFFE_KNOWN_TYPE(PackedLinearWeight);
 #endif // USE_FBGEMM
 } // namespace caffe2
 
@@ -42,6 +42,10 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel {
   }
 
   at::Tensor operator()(at::Tensor weight) {
+    TORCH_CHECK(
+        weight.dim() == 2,
+        "The weight tensor for quantized::fbgemm_linear_prepack should be 2-dimensional.");
+
     auto N = weight.size(0);
     auto K = weight.size(1);
 
@@ -61,7 +65,7 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel {
       /*B_zero_point=*/weight_zero_point_int32,
      /*col_offsets=*/col_offsets.data());
 
-    auto ret_ptr = guts::make_unique<PackedFCWeight>(PackedFCWeight{
+    auto ret_ptr = guts::make_unique<PackedLinearWeight>(PackedLinearWeight{
        guts::make_unique<fbgemm::PackBMatrix<int8_t>>(
            /*trans=*/fbgemm::matrix_op_t::Transpose,
            /*nRow=*/K,
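With the new TORCH_CHECK, a weight that is not 2-dimensional is rejected up front. A sketch of the expected user-visible behavior, assuming an FBGEMM-enabled build (TORCH_CHECK failures surface as RuntimeError in Python):

import torch

# A 1-D quantized weight should be refused by the prepack operator.
W_1d = torch.quantize_linear(torch.randn(8), scale=0.1, zero_point=0, dtype=torch.qint8)
try:
    torch.ops.quantized.fbgemm_linear_prepack(W_1d)
except RuntimeError as err:
    assert "2-dimensional" in str(err)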
aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+#include <ATen/ATen.h>
+#include <ATen/core/Type.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cpp_custom_type_hack.h>
+#include <ATen/native/quantized/cpu/fbgemm_utils.h>
+#include <ATen/quantized/Quantizer.h>
+
+namespace at {
+namespace native {
+namespace {
+
+class QLinearUnpackWeightInt8 final : public c10::OperatorKernel {
+ public:
+#ifdef USE_FBGEMM
+  at::Tensor operator()(at::Tensor packed_weight) {
+    // Pull out the PackBMatrix instance from the owning tensor.
+    auto& pack_ptr =
+        cpp_custom_type_hack::cast<PackedLinearWeight>(packed_weight);
+    auto packB = pack_ptr.w.get();
+
+    int64_t N = static_cast<int64_t>(packB->numCols());
+    int64_t K = static_cast<int64_t>(packB->numRows());
+
+    float weight_scale_float = pack_ptr.w_scale;
+    int32_t weight_zero_point_int32 = pack_ptr.w_zp;
+
+    auto weight_origin = _empty_affine_quantized(
+        {N, K},
+        at::device(kCPU).dtype(kQInt8),
+        weight_scale_float,
+        weight_zero_point_int32);
+    int8_t* weight_ptr_int8 =
+        reinterpret_cast<int8_t*>(weight_origin.data<c10::qint8>());
+
+    // packB->printPackedMatrix("packedB inside fbgemm_unpack
+    // (QLinearUnpackWeightInt8): ");
+    packB->unpack(weight_ptr_int8);
+
+    return weight_origin;
+  }
+#else // USE_FBGEMM
+  at::Tensor operator()(at::Tensor /* weight */
+  ) {
+    // We make a strong guarantee that models using these operators will have
+    // the same numerics across different machines. Therefore, we do not provide
+    // a fallback path and rather fail loudly if we cannot run FBGEMM.
+    TORCH_CHECK(
+        false, "This PyTorch installation was not built with FBGEMM operators");
+  }
+#endif // USE_FBGEMM
+};
+
+static auto registry = c10::RegisterOperators().op(
+    "quantized::fbgemm_linear_unpack(Tensor W_prepack) -> Tensor W_origin",
+    c10::RegisterOperators::options().kernel<QLinearUnpackWeightInt8>(
+        CPUTensorId()));
+
+} // namespace
+} // namespace native
+} // namespace at

test/test_quantized.py

Lines changed: 20 additions & 0 deletions
@@ -451,6 +451,26 @@ def test_qlinear_relu(self):
         # Assert equal
         np.testing.assert_equal(Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy())
 
+    """Tests the correctness of the quantized::fbgemm_linear_unpack op."""
+    @given(Q=qtensor(shapes=array_shapes(2, 2,), dtypes=((torch.qint8, np.int8, None),)))
+    def test_qlinear_unpack(self, Q):
+        W, (W_scale, W_zp), (qmin, qmax), (torch_type, np_type) = Q
+        qlinear_prepack = torch.ops.quantized.fbgemm_linear_prepack
+        qlinear_unpack = torch.ops.quantized.fbgemm_linear_unpack
+
+        W = torch.from_numpy(W)
+        W_q = torch.quantize_linear(W, scale=W_scale, zero_point=W_zp, dtype=torch_type)
+
+        # Weight prepacking operator for quantized Linear
+        W_prepack = qlinear_prepack(W_q)
+        # Weight unpack operator for quantized Linear (Used for serialization)
+        W_q_origin = qlinear_unpack(W_prepack)
+
+        # Assert equal
+        np.testing.assert_equal(W_q.int_repr(), W_q_origin.int_repr().numpy())
+        np.testing.assert_equal(W_q.q_scale(), W_q_origin.q_scale())
+        np.testing.assert_equal(W_q.q_zero_point(), W_q_origin.q_zero_point())
+
 
 @unittest.skipIf(
     TEST_WITH_UBSAN or not torch.fbgemm_is_cpu_supported(),
