Commit 30c0373

add fused dropout kernels

Natalia Gimelshein committed · 1 parent b5c8d59

File tree

4 files changed: +215 -6 lines changed

aten/src/ATen/native/cuda/Dropout.cu

Lines changed: 159 additions & 0 deletions
#include "ATen/ATen.h"
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDAApplyUtils.cuh"
#include "detail/IndexUtils.cuh"
#include "detail/TensorInfo.cuh"
#include "curand_kernel.h"

#include <THC/THCGeneral.h>
#include <THC/THCTensorRandom.h>
#include <THC/THCGenerator.hpp>


THCGenerator* THCRandom_getGenerator(THCState* state);

namespace at{
namespace native{

namespace {

// due to limitations of the philox generator, UNROLL has to be 4
const int UNROLL = 4;

std::pair<uint64_t, uint64_t> next_philox_seed(at::Generator* gen, uint64_t increment) {
  auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState());
  uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment);
  return std::make_pair(gen_->state.initial_seed, offset);
}

template <
    typename scalar_t,
    typename accscalar_t,
    typename IndexType,
    int ADims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(256, 8)
#endif
__global__ void
fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
                     cuda::detail::TensorInfo<scalar_t, IndexType> b,
                     cuda::detail::TensorInfo<uint8_t, IndexType> c,
                     IndexType totalElements, accscalar_t p,
                     std::pair<uint64_t, uint64_t> seeds) {
  accscalar_t pinv = accscalar_t(1)/p;
  IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
  curandStatePhilox4_32_10_t state;
  curand_init(
      seeds.first,
      idx,
      seeds.second,
      &state);
  // round the loop bound up so that every thread makes the same number of
  // curand_uniform4 calls and the philox states stay in lockstep
  IndexType rounded_size = ((totalElements - 1)/(blockDim.x * gridDim.x * UNROLL) + 1) *
      blockDim.x * gridDim.x * UNROLL;
  for (IndexType linearIndex = idx;
       linearIndex < rounded_size;
       linearIndex += gridDim.x * blockDim.x * UNROLL) {
    // curand_uniform_double does not do what it promises, and there is no
    // variant for halfs, so generate floats for all types
    float4 rand = curand_uniform4(&state);
    scalar_t src[UNROLL];
    // turn the uniform draws into a {0, 1} keep mask (p is the keep probability)
    rand.x = rand.x < p;
    rand.y = rand.y < p;
    rand.z = rand.z < p;
    rand.w = rand.w < p;
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        // convert `li` into an offset of `a`
        const IndexType aOffset =
            cuda::detail::IndexToOffset<scalar_t, IndexType, ADims>::get(li, a);
        src[ii] = a.data[aOffset];
      }
    }
    for (int ii = 0; ii < UNROLL; ii++) {
      IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
      if (li < totalElements) {
        // convert `li` into an offset of `b`; `b` and `c` are contiguous,
        // so the same offset serves both
        const IndexType bOffset =
            cuda::detail::IndexToOffset<scalar_t, IndexType, 1>::get(li, b);
        b.data[bOffset] = src[ii]*(&rand.x)[ii]*pinv;
        c.data[bOffset] = (uint8_t)(&rand.x)[ii];
      }
    }
    __syncthreads();
  }
}

template<typename scalar_t, typename accscalar_t>
void masked_scale_kernel(at::Tensor& ret, const at::Tensor src, const at::Tensor mask, accscalar_t scale){
  at::cuda::CUDA_tensor_apply3<scalar_t, scalar_t, uint8_t>(ret, src, mask,
      [scale]__device__(scalar_t& ret_val, const scalar_t& src_val, const uint8_t mask_val){
        ret_val = mask_val * src_val * scale;
      });
}
} // anonymous namespace

std::tuple<Tensor,Tensor>
fused_dropout_cuda(const Tensor& self, double p, Generator * gen){
  Tensor ret = at::empty_like(self);
  Tensor mask = self.type().toScalarType(kByte).tensor(self.sizes());
  const int64_t nelem = self.numel();
  int64_t block_size = 256;
  unsigned int blocks_per_sm = at::globalContext().getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size;
  dim3 dim_block(block_size);
  dim3 grid((nelem + block_size - 1)/block_size);
  grid.x = std::min((unsigned int)at::globalContext().getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);
  // number of philox counter slots to reserve for this launch
  int64_t nrep = ((nelem - 1)/(block_size*grid.x*UNROLL) + 1)*UNROLL;
  if (cuda::detail::canUse32BitIndexMath(self)){
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "fused_dropout", [&] {
      using accscalar_t = acc_type<scalar_t, true>;
      accscalar_t pa = (accscalar_t)(p);
      auto self_info = cuda::detail::getTensorInfo<scalar_t, unsigned int>(self);
      auto ret_info = cuda::detail::getTensorInfo<scalar_t, unsigned int>(ret);
      auto mask_info = cuda::detail::getTensorInfo<uint8_t, unsigned int>(mask);
      self_info.collapseDims();
      ret_info.collapseDims();
      mask_info.collapseDims(); // ret and mask are collapsed to 1d contiguous tensors
      switch (self_info.dims) {
        case 1:
          fused_dropout_kernel<scalar_t, accscalar_t, unsigned int, 1><<<grid, dim_block, 0, globalContext().getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen, nrep));
          break;
        default:
          fused_dropout_kernel<scalar_t, accscalar_t, unsigned int, -1><<<grid, dim_block, 0, globalContext().getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen, nrep));
      }
    });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "fused_dropout", [&] {
      using accscalar_t = acc_type<scalar_t, true>;
      accscalar_t pa = (accscalar_t)(p);
      auto self_info = cuda::detail::getTensorInfo<scalar_t, uint64_t>(self);
      auto ret_info = cuda::detail::getTensorInfo<scalar_t, uint64_t>(ret);
      auto mask_info = cuda::detail::getTensorInfo<uint8_t, uint64_t>(mask);
      self_info.collapseDims();
      ret_info.collapseDims();
      mask_info.collapseDims(); // ret and mask are collapsed to 1d contiguous tensors
      switch (self_info.dims) {
        case 1:
          fused_dropout_kernel<scalar_t, accscalar_t, uint64_t, 1><<<grid, dim_block, 0, globalContext().getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen, nrep));
          break;
        default:
          fused_dropout_kernel<scalar_t, accscalar_t, uint64_t, -1><<<grid, dim_block, 0, globalContext().getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen, nrep));
      }
    });
  }
  THCudaCheck(cudaGetLastError());
  return std::tuple<Tensor,Tensor>(ret, mask);
}

Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){
  Tensor ret = at::empty_like(self);
  AT_CHECK(mask.type().scalarType() == at::ScalarType::Byte, "mask should be torch.uint8 dtype");
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "masked_scale", [&] {
    using accscalar_t = acc_type<scalar_t, true>;
    accscalar_t pa = (accscalar_t)(scale);
    masked_scale_kernel<scalar_t>(ret, self, mask, pa);
  });
  return ret;
}

}
}
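Note that p here is the keep probability, not the drop probability: each uniform draw below p yields a mask value of 1, and surviving elements are scaled by pinv = 1/p. A minimal PyTorch sketch of the semantics the kernel computes, as a reading aid rather than a drop-in equivalent (fused_dropout_reference is a hypothetical name):

import torch

def fused_dropout_reference(x, p):
    # p is the KEEP probability; each mask entry is 1 with probability p
    mask = (torch.rand_like(x, dtype=torch.float) < p).to(torch.uint8)
    # survivors are rescaled by 1/p so the expected value of x is preserved
    return x * mask.type_as(x) / p, mask

Every thread iterates over the same rounded-up range, so all Philox states advance in lockstep; that is also why UNROLL is pinned to 4, the number of floats one curand_uniform4 call produces.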

aten/src/ATen/native/native_functions.yaml

Lines changed: 9 additions & 0 deletions
@@ -495,6 +495,10 @@
 - func: dot_out(Tensor result, Tensor self, Tensor tensor) -> Tensor
   variants: function
 
+- func: fused_dropout(Tensor self, double p, Generator* generator=nullptr) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: fused_dropout_cuda
+
 - func: einsum(std::string equation, TensorList tensors) -> Tensor
   variants: function
 
@@ -905,6 +909,11 @@
 - func: logsumexp_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor
   variants: function
 
+- func: masked_scale(Tensor self, Tensor mask, double scale) -> Tensor
+  dispatch:
+    CUDA: masked_scale_cuda
+
+
 - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor
   variants: function
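Neither entry declares a CPU dispatch, so the new functions are reachable only for CUDA tensors. A hedged sketch of calling them directly (illustrative values; in practice only the autograd function in the last file below uses them):

import torch

x = torch.randn(1000, device='cuda')
keep_prob = 0.8  # fused_dropout takes the keep probability, not the drop probability

out, mask = x.fused_dropout(keep_prob)        # out == x * mask / keep_prob
masked = out.masked_scale(mask, keep_prob)    # mask * out * keep_prob == x * mask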

test/test_nn.py

Lines changed: 30 additions & 5 deletions
@@ -712,9 +712,10 @@ def test_no_grad(self):
         self.assertFalse(output2.requires_grad)
         self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10)))
 
-    def _test_dropout(self, cls, input):
+    def _test_dropout(self, cls, cuda, input):
         p = 0.2
-        input.fill_(1 - p)
+        device = torch.device("cuda") if cuda else torch.device("cpu")
+        input = input.to(device).fill_(1 - p)
 
         module = cls(p)
         input_var = torch.tensor(input, requires_grad=True)
@@ -2077,15 +2078,15 @@ def func(x):
 
     def test_Dropout(self):
         input = torch.Tensor(1000)
-        self._test_dropout(nn.Dropout, input)
+        self._test_dropout(nn.Dropout, False, input)
 
     def test_Dropout2d(self):
         b = random.randint(1, 5)
         w = random.randint(1, 5)
         h = random.randint(1, 5)
         num_features = 1000
         input = torch.Tensor(num_features, b, w, h)
-        self._test_dropout(nn.Dropout2d, input)
+        self._test_dropout(nn.Dropout2d, False, input)
 
     def test_Dropout3d(self):
         b = random.randint(1, 5)
@@ -2094,7 +2095,31 @@ def test_Dropout3d(self):
         d = random.randint(1, 2)
         num_features = 1000
         input = torch.Tensor(num_features, b, d, w, h)
-        self._test_dropout(nn.Dropout3d, input)
+        self._test_dropout(nn.Dropout3d, False, input)
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_Dropout_cuda(self):
+        input = torch.Tensor(1000)
+        self._test_dropout(nn.Dropout, True, input)
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_Dropout2d_cuda(self):
+        b = random.randint(1, 5)
+        w = random.randint(1, 5)
+        h = random.randint(1, 5)
+        num_features = 1000
+        input = torch.Tensor(num_features, b, w, h)
+        self._test_dropout(nn.Dropout2d, True, input)
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_Dropout3d_cuda(self):
+        b = random.randint(1, 5)
+        w = random.randint(1, 5)
+        h = random.randint(1, 5)
+        d = random.randint(1, 2)
+        num_features = 1000
+        input = torch.Tensor(num_features, b, d, w, h)
+        self._test_dropout(nn.Dropout3d, True, input)
 
     def test_AlphaDropout(self):
         # generate random tensor with zero mean and unit std
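The CUDA variants reuse _test_dropout unchanged, so the fused kernel is held to the same checks as the CPU path. A hedged sketch of the kind of invariant such a test can rely on (illustrative, not the helper's actual body; assumes a CUDA device):

import torch
import torch.nn as nn

p = 0.2
x = torch.ones(100000, device='cuda')
out = nn.Dropout(p)(x)  # freshly constructed modules are in training mode

zero_frac = (out == 0).float().mean().item()
assert abs(zero_frac - p) < 0.02            # about a fraction p of entries are dropped
assert abs(out.mean().item() - 1.0) < 0.05  # rescaling by 1/(1-p) preserves the mean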

torch/nn/_functions/dropout.py

Lines changed: 17 additions & 1 deletion
@@ -1,6 +1,7 @@
 import torch
 from torch.autograd.function import InplaceFunction
 from itertools import repeat
+from torch.autograd.function import once_differentiable
 
 
 class Dropout(InplaceFunction):
@@ -15,6 +16,10 @@ def symbolic(g, input, p=0.5, train=False, inplace=False):
         r, _ = g.op("Dropout", input, ratio_f=p, is_test_i=not train, outputs=2)
         return r
 
+    @staticmethod
+    def _fused_kernel_acceptable(input, p, cls_name, inplace):
+        return input.is_cuda and p > 0 and p < 1 and not inplace and cls_name == 'Dropout'
+
     @classmethod
     def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
         if p < 0 or p > 1:
@@ -23,10 +28,15 @@ def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
         ctx.p = p
         ctx.train = train
         ctx.inplace = inplace
+        ctx.use_fused_kernel = Dropout._fused_kernel_acceptable(input, ctx.p, cls.__name__, ctx.inplace)
 
         if ctx.p == 0 or not ctx.train:
             return input
 
+        if ctx.use_fused_kernel:
+            output, ctx.noise = input.fused_dropout(1 - ctx.p)
+            return output
+
         if ctx.inplace:
             ctx.mark_dirty(input)
             output = input
@@ -45,7 +55,13 @@ def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
 
     @staticmethod
     def backward(ctx, grad_output):
-        if ctx.p > 0 and ctx.train:
+        if ctx.use_fused_kernel:
+            if not grad_output.requires_grad:
+                return grad_output.masked_scale(ctx.noise, 1. / (1 - ctx.p)), None, None, None
+            else:
+                # use the autograd-friendly backward if double backward is required
+                return grad_output * (ctx.noise.type_as(grad_output) * (1. / (1 - ctx.p))), None, None, None
+        elif ctx.p > 0 and ctx.train:
             return grad_output * ctx.noise, None, None, None
         else:
             return grad_output, None, None, None
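The two fused backward branches compute the same gradient: masked_scale is a single fused kernel, while the eager expression is built from differentiable ops so a second backward can trace through it. A small equivalence sketch (assumes a CUDA build that exposes the new native functions):

import torch

p = 0.2
g = torch.randn(1000, device='cuda')
mask = (torch.rand(1000, device='cuda') < (1 - p)).to(torch.uint8)

fused = g.masked_scale(mask, 1. / (1 - p))      # fused kernel: mask * g * scale
eager = g * (mask.type_as(g) * (1. / (1 - p)))  # autograd-friendly equivalent
assert torch.allclose(fused, eager)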
