
Commit 543faed

apaszke authored and Rob Kunkle committed
Move dropout and alpha dropout to ATen (pytorch#10384)
Summary: zdevito ezyang

Pull Request resolved: pytorch#10384
Reviewed By: ezyang
Differential Revision: D9272583
Pulled By: apaszke
fbshipit-source-id: ed5d37b28ce9ff25800bbaa0daf066cfbf1f9921
1 parent 9c691cc commit 543faed

File tree

10 files changed: +233 additions, -175 deletions

aten/src/ATen/native/Dropout.cpp

Lines changed: 118 additions & 0 deletions (new file)

@@ -0,0 +1,118 @@
#include "ATen/ATen.h"
#include "ATen/Dispatch.h"

namespace at { namespace native {

namespace {

Tensor make_feature_noise(const Tensor& input) {
  auto input_sizes = input.sizes();
  AT_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input");
  std::vector<int64_t> sizes;
  sizes.reserve(input.dim());
  sizes.push_back(input_sizes[0]);
  sizes.push_back(input_sizes[1]);
  for (int64_t i = 2; i < input.dim(); ++i)
    sizes.push_back(1);
  return at::empty(sizes, input.options());
}

bool is_fused_kernel_acceptable(const Tensor& input, double p) {
  return input.is_cuda() && p > 0 && p < 1;
}

// NB: sure, we could have used different overloads here, but I would feel insecure
// knowing that this dispatch depends only on the constness of the references
template<bool inplace>
Tensor& multiply(Tensor& input, const Tensor& noise) {
  static_assert(inplace, "Wrong multiply overload triggered in Dropout.cpp");
  return input.mul_(noise);
}

template<bool inplace>
Tensor multiply(const Tensor& input, const Tensor& noise) {
  static_assert(!inplace, "Wrong multiply overload triggered in Dropout.cpp");
  return input.mul(noise);
}

template<bool feature_dropout, bool alpha_dropout, bool inplace, typename T>
typename std::conditional<inplace, Tensor&, Tensor>::type
_dropout_impl(T& input, double p, bool train) {
  AT_CHECK(p >= 0 && p <= 1, "dropout probability has to be between 0 and 1, but got ", p);
  if (p == 0 || !train) {
    return input;
  }

  if (p == 1) {
    return multiply<inplace>(input, at::zeros({}, input.options()));
  }

  at::Tensor b; // used for alpha_dropout only
  auto noise = feature_dropout ? make_feature_noise(input) : at::empty_like(input);
  noise.bernoulli_(1 - p);
  if (alpha_dropout) {
    constexpr double alpha = 1.7580993408473766;
    double a = 1. / std::sqrt((alpha * alpha * p + 1) * (1 - p));
    b = noise.add(-1).mul_(alpha * a).add_(alpha * a * p);
    noise.mul_(a);
  } else {
    noise.div_(1 - p);
  }

  if (!alpha_dropout) {
    return multiply<inplace>(input, noise);
  } else {
    return multiply<inplace>(input, noise).add_(b);
  }
}

#define ALIAS_SPECIALIZATION(ALIAS_NAME, IS_FEATURE, IS_ALPHA)                      \
template <bool inplace, typename... Args>                                           \
typename std::conditional<inplace, Tensor&, Tensor>::type                           \
ALIAS_NAME(Args&&... args) {                                                        \
  return _dropout_impl<IS_FEATURE, IS_ALPHA, inplace>(std::forward<Args>(args)...); \
}

ALIAS_SPECIALIZATION(_dropout,               false, false)
ALIAS_SPECIALIZATION(_feature_dropout,       true,  false)
ALIAS_SPECIALIZATION(_alpha_dropout,         false, true )
ALIAS_SPECIALIZATION(_feature_alpha_dropout, true,  true )

} // anonymous namespace

Tensor dropout(const Tensor& input, double p, bool train) {
  if (is_fused_kernel_acceptable(input, p)) {
    return std::get<0>(input._fused_dropout(1 - p));
  }
  return _dropout<false>(input, p, train);
}

Tensor& dropout_(Tensor& input, double p, bool train) {
  return _dropout<true>(input, p, train);
}

Tensor feature_dropout(const Tensor& input, double p, bool train) {
  return _feature_dropout<false>(input, p, train);
}

Tensor& feature_dropout_(Tensor& input, double p, bool train) {
  return _feature_dropout<true>(input, p, train);
}

Tensor alpha_dropout(const Tensor& input, double p, bool train) {
  return _alpha_dropout<false>(input, p, train);
}

Tensor& alpha_dropout_(Tensor& input, double p, bool train) {
  return _alpha_dropout<true>(input, p, train);
}

Tensor feature_alpha_dropout(const Tensor& input, double p, bool train) {
  return _feature_alpha_dropout<false>(input, p, train);
}

Tensor& feature_alpha_dropout_(Tensor& input, double p, bool train) {
  return _feature_alpha_dropout<true>(input, p, train);
}

}} // namespace at::native
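
For context, the alpha_dropout branch above mirrors the scaling used with SELU activations: kept units are scaled by a and everything is shifted by b so that a zero-mean, unit-variance input stays approximately standardized. The following numerical sanity check of those constants is an illustration only (not part of the commit); the value of p is arbitrary.

import torch

# Mirror the constants and expressions from _dropout_impl's alpha_dropout branch.
p = 0.2                                             # drop probability (illustrative value)
alpha = 1.7580993408473766                          # SELU saturation constant from the C++ code
a = 1. / ((alpha * alpha * p + 1) * (1 - p)) ** 0.5

x = torch.randn(1_000_000)                          # standardized input
noise = torch.empty_like(x).bernoulli_(1 - p)       # 1 = keep, 0 = drop
b = (noise - 1) * (alpha * a) + alpha * a * p       # same formula as the C++ "b"
out = x * (noise * a) + b                           # same as multiply(input, noise).add_(b)

print(out.mean().item(), out.var().item())          # both stay close to 0 and 1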

aten/src/ATen/native/native_functions.yaml

Lines changed: 24 additions & 0 deletions

@@ -62,6 +62,30 @@
   dispatch:
     CUDA: masked_scale_cuda
 
+- func: dropout(Tensor input, double p, bool train) -> Tensor
+  variants: function
+
+- func: dropout_(Tensor self, double p, bool train) -> Tensor
+  variants: function
+
+- func: feature_dropout(Tensor input, double p, bool train) -> Tensor
+  variants: function
+
+- func: feature_dropout_(Tensor self, double p, bool train) -> Tensor
+  variants: function
+
+- func: alpha_dropout(Tensor input, double p, bool train) -> Tensor
+  variants: function
+
+- func: alpha_dropout_(Tensor self, double p, bool train) -> Tensor
+  variants: function
+
+- func: feature_alpha_dropout(Tensor input, double p, bool train) -> Tensor
+  variants: function
+
+- func: feature_alpha_dropout_(Tensor self, double p, bool train) -> Tensor
+  variants: function
+
 - func: abs(Tensor self) -> Tensor
 
 - func: abs_(Tensor self) -> Tensor
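
Because these declarations use "variants: function", the code generator exposes them as free functions, which surface in the Python torch namespace. A hedged usage sketch of the generated bindings, for illustration only (argument order follows the schemas above):

import torch

x = torch.randn(4, 8, 16)

# With train=False these are no-ops; with train=True elements are dropped with probability p.
y1 = torch.dropout(x, 0.5, True)                # plain element-wise dropout
y2 = torch.feature_dropout(x, 0.5, True)        # zeroes entire channels along dim 1
y3 = torch.alpha_dropout(x, 0.5, True)          # SELU-compatible dropout
y4 = torch.feature_alpha_dropout(x, 0.5, True)  # channel-wise alpha dropout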

test/expect/TestJit.test_alexnet.expect

Lines changed: 19 additions & 18 deletions

@@ -44,22 +44,23 @@ graph(%0 : Double(1, 3, 224, 224)
   %46 : int = prim::Constant[value=9216](), scope: AlexNet
   %47 : int[] = prim::ListConstruct(%45, %46), scope: AlexNet
   %48 : Double(1, 9216) = aten::view(%41, %47), scope: AlexNet
-  %49 : Double(1, 9216) = ^Dropout(0.5, True, False)(%48), scope: AlexNet/Sequential[classifier]/Dropout[0]
-  %50 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1]
-  %51 : int = prim::Constant[value=4096](), scope: AlexNet/Sequential[classifier]/Linear[1]
-  %52 : int[] = prim::ListConstruct(%21, %51), scope: AlexNet/Sequential[classifier]/Linear[1]
-  %53 : Double(1, 4096) = aten::expand(%12, %52, %21), scope: AlexNet/Sequential[classifier]/Linear[1]
-  %54 : Double(1, 4096) = aten::addmm(%53, %49, %50, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[1]
-  %55 : Double(1, 4096) = aten::threshold(%54, %23, %23), scope: AlexNet/Sequential[classifier]/ReLU[2]
-  %56 : Double(1, 4096) = ^Dropout(0.5, True, False)(%55), scope: AlexNet/Sequential[classifier]/Dropout[3]
-  %57 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4]
-  %58 : Double(1, 4096) = aten::expand(%14, %52, %21), scope: AlexNet/Sequential[classifier]/Linear[4]
-  %59 : Double(1, 4096) = aten::addmm(%58, %56, %57, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[4]
-  %60 : Double(1, 4096) = aten::threshold(%59, %23, %23), scope: AlexNet/Sequential[classifier]/ReLU[5]
-  %61 : Double(4096!, 1000!) = aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6]
-  %62 : int = prim::Constant[value=1000](), scope: AlexNet/Sequential[classifier]/Linear[6]
-  %63 : int[] = prim::ListConstruct(%21, %62), scope: AlexNet/Sequential[classifier]/Linear[6]
-  %64 : Double(1, 1000) = aten::expand(%16, %63, %21), scope: AlexNet/Sequential[classifier]/Linear[6]
-  %65 : Double(1, 1000) = aten::addmm(%64, %60, %61, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[6]
-  return (%65);
+  %49 : float = prim::Constant[value=0.5](), scope: AlexNet/Sequential[classifier]/Dropout[0]
+  %50 : Double(1!, 9216) = aten::dropout(%48, %49, %21), scope: AlexNet/Sequential[classifier]/Dropout[0]
+  %51 : Double(9216!, 4096!) = aten::t(%11), scope: AlexNet/Sequential[classifier]/Linear[1]
+  %52 : int = prim::Constant[value=4096](), scope: AlexNet/Sequential[classifier]/Linear[1]
+  %53 : int[] = prim::ListConstruct(%21, %52), scope: AlexNet/Sequential[classifier]/Linear[1]
+  %54 : Double(1, 4096) = aten::expand(%12, %53, %21), scope: AlexNet/Sequential[classifier]/Linear[1]
+  %55 : Double(1, 4096) = aten::addmm(%54, %50, %51, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[1]
+  %56 : Double(1, 4096) = aten::threshold(%55, %23, %23), scope: AlexNet/Sequential[classifier]/ReLU[2]
+  %57 : Double(1!, 4096) = aten::dropout(%56, %49, %21), scope: AlexNet/Sequential[classifier]/Dropout[3]
+  %58 : Double(4096!, 4096!) = aten::t(%13), scope: AlexNet/Sequential[classifier]/Linear[4]
+  %59 : Double(1, 4096) = aten::expand(%14, %53, %21), scope: AlexNet/Sequential[classifier]/Linear[4]
+  %60 : Double(1, 4096) = aten::addmm(%59, %57, %58, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[4]
+  %61 : Double(1, 4096) = aten::threshold(%60, %23, %23), scope: AlexNet/Sequential[classifier]/ReLU[5]
+  %62 : Double(4096!, 1000!) = aten::t(%15), scope: AlexNet/Sequential[classifier]/Linear[6]
+  %63 : int = prim::Constant[value=1000](), scope: AlexNet/Sequential[classifier]/Linear[6]
+  %64 : int[] = prim::ListConstruct(%21, %63), scope: AlexNet/Sequential[classifier]/Linear[6]
+  %65 : Double(1, 1000) = aten::expand(%16, %64, %21), scope: AlexNet/Sequential[classifier]/Linear[6]
+  %66 : Double(1, 1000) = aten::addmm(%65, %61, %62, %21, %21), scope: AlexNet/Sequential[classifier]/Linear[6]
+  return (%66);
 }

Lines changed: 4 additions & 2 deletions

@@ -1,4 +1,6 @@
 graph(%0 : Double(2, 2)) {
-  %1 : Double(2, 2) = ^Dropout(0.6, True, False)(%0), scope: Dropout
-  return (%1);
+  %1 : float = prim::Constant[value=0.6](), scope: Dropout
+  %2 : int = prim::Constant[value=1](), scope: Dropout
+  %3 : Double(2, 2) = aten::dropout(%0, %1, %2), scope: Dropout
+  return (%3);
 }
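
The expect-file changes above show that tracing a module containing nn.Dropout now records an aten::dropout node instead of the old Python ^Dropout autograd function. A minimal sketch of inspecting such a graph, illustrative only (it uses the current torch.jit.trace call signature, which may differ from the tracing harness that generated these expect files):

import torch
import torch.nn as nn

model = nn.Dropout(p=0.6)
x = torch.randn(2, 2)

traced = torch.jit.trace(model, x)   # tracing records aten::dropout in the graph
print(traced.graph)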

tools/autograd/derivatives.yaml

Lines changed: 3 additions & 0 deletions

@@ -228,6 +228,9 @@
   self: grad * tensor
   tensor: grad * self
 
+- name: _fused_dropout(Tensor self, double p, Generator generator)
+  self: _fused_dropout_backward(grad, result1, p)
+
 - name: eig(Tensor self, bool eigenvectors)
   self: not_implemented("eig")
tools/autograd/templates/Functions.cpp

Lines changed: 10 additions & 0 deletions

@@ -525,6 +525,16 @@ Tensor repeat_backward(Tensor grad, int64_t input_dims, IntList repeats) {
   return grad;
 }
 
+// p1m == 1 - p
+Tensor _fused_dropout_backward(Tensor grad, Tensor mask, double p1m) {
+  if (grad.requires_grad()) {
+    // Use autograd-friendly backward if double backward is required
+    return grad * (mask.type_as(grad) * (1. / p1m));
+  } else {
+    return grad._masked_scale(mask, 1. / p1m);
+  }
+}
+
 Tensor select_equals_backward(Tensor grad, const Tensor & input, const Tensor & value) {
   auto grad_input = zeros_like(input);
   grad_input.masked_fill_(input == value, grad);
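
Note that dropout calls _fused_dropout with 1 - p, so the p1m argument here is the keep probability: the forward scales kept elements by 1 / p1m, and the backward is therefore grad * mask / p1m. A reference check in plain autograd, illustrative only (it uses an explicit mask instead of the fused CUDA kernel):

import torch

keep = 0.4                                   # keep probability, i.e. p1m = 1 - p
x = torch.randn(5, requires_grad=True)
mask = torch.empty_like(x).bernoulli_(keep)

out = x * mask / keep                        # reference dropout forward
out.backward(torch.ones_like(out))

expected = torch.ones_like(x) * mask / keep  # grad * mask / p1m
print(torch.allclose(x.grad, expected))      # True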

torch/nn/_functions/dropout.py

Lines changed: 0 additions & 146 deletions
This file was deleted.

torch/nn/backends/thnn.py

Lines changed: 0 additions & 4 deletions

@@ -22,16 +22,12 @@ def _initialize_backend():
     from .._functions.thnn import _all_functions as _thnn_functions
     from .._functions.rnn import RNN, \
         RNNTanhCell, RNNReLUCell, GRUCell, LSTMCell
-    from .._functions.dropout import Dropout, FeatureDropout
 
     backend.register_function('RNN', RNN)
     backend.register_function('RNNTanhCell', RNNTanhCell)
     backend.register_function('RNNReLUCell', RNNReLUCell)
     backend.register_function('LSTMCell', LSTMCell)
     backend.register_function('GRUCell', GRUCell)
-    backend.register_function('Dropout', Dropout)
-    backend.register_function('Dropout2d', FeatureDropout)
-    backend.register_function('Dropout3d', FeatureDropout)
     for cls in _thnn_functions:
         name = cls.__name__
         backend.register_function(name, cls)
