diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 4b4014ba42d37..d9b2a2e096a1e 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -226,7 +226,7 @@ else export MAX_JOBS=`expr $(nproc) - 1` fi - FULL_CAFFE2=1 python setup.py install --user + USE_OPENCV=1 BUILD_BINARY=1 python setup.py install --user # This is to save test binaries for testing cp -r torch/lib/tmp_install $INSTALL_PREFIX diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fdfe4aef7f94..b7f56f96e87f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,7 +56,7 @@ include(CMakeDependentOption) option(BUILD_TORCH "Build Torch" OFF) option(ATEN_NO_TEST "Do not build ATen test binaries" OFF) option(BUILD_ATEN_MOBILE "Build ATen for Android and iOS" OFF) -option(BUILD_BINARY "Build C++ binaries" ON) +option(BUILD_BINARY "Build C++ binaries" OFF) option(BUILD_DOCS "Build Caffe2 documentation" OFF) option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON) option(BUILD_PYTHON "Build Python binaries" ON) @@ -115,7 +115,7 @@ option(USE_IDEEP "Use IDEEP interface in MKL BLAS" ON) option(USE_MKLML "Use MKLML interface in MKL BLAS" ON) option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option( - USE_MPI "Use MPI. Only available if USE_DISTRIBUTED is on." ON + USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." OFF "USE_DISTRIBUTED" OFF) cmake_dependent_option( USE_GLOO "Use Gloo. Only available if USE_DISTRIBUTED is on." ON diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 77953c9e18e60..40cb364e91fd0 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -107,14 +107,14 @@ bool Context::setFlushDenormal(bool on) { #endif } -Type& getMaybeVariableType(TensorOptions options) { - return globalContext().getMaybeVariableType( +Type& getType(TensorOptions options) { + return globalContext().getType( options.backend(), options.dtype(), options.is_variable()); } -Type& getMaybeVariableType(const TensorImpl* impl) { +Type& getType(const TensorImpl* impl) { Backend backend = tensorTypeIdToBackend(impl->type_id()); - return globalContext().getMaybeVariableType( + return globalContext().getType( backend, impl->scalar_type(), impl->is_variable()); } @@ -122,4 +122,18 @@ Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } +struct LegacyTypeInit : public LegacyTypeInitInterface { + LegacyTypeInit(LegacyTypeInitArgs) {} + void initCPU() const override { + globalContext(); + } + void initCUDA() const override { + globalContext().lazyInitCUDA(); + } + void initComplex() const override { + globalContext().lazyInitComplex(); + } +}; +REGISTER_LEGACY_TYPE_INIT(LegacyTypeInit); + } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index faf451ef72d5d..7b3634dd83086 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -8,8 +8,9 @@ #include "ATen/Utils.h" #include "ATen/core/Error.h" #include "ATen/detail/CUDAHooksInterface.h" -#include "ATen/detail/VariableHooksInterface.h" +#include "ATen/core/VariableHooksInterface.h" #include "ATen/detail/ComplexHooksInterface.h" +#include "ATen/core/LegacyTypeDispatch.h" // This is temporary #include "ATen/core/ATenCoreTest.h" @@ -24,43 +25,25 @@ class AT_API Context { public: Context(); Type* getNonVariableTypeRaw(Backend p, ScalarType s) { - return type_registry[static_cast(p)][static_cast(s)].get(); + return globalLegacyTypeDispatch().getNonVariableTypeRaw(p, s); } Type * getNonVariableTypeOpt(Backend p, ScalarType s) { - if (p != 
Backend::Undefined) { - initCUDAIfNeeded(backendToDeviceType(p)); - initComplexIfNeeded(s); - } - auto type = getNonVariableTypeRaw(p, s); - - if(!type) { - // there is only a single Undefined Type. - if (p == Backend::Undefined || s == ScalarType::Undefined) { - return getNonVariableTypeRaw(Backend::Undefined, ScalarType::Undefined); - } - } - - return type; + return globalLegacyTypeDispatch().getNonVariableTypeOpt(p, s); } Type & getNonVariableType(Backend p, ScalarType s) { - auto* type = getNonVariableTypeOpt(p, s); - if (!type) AT_ERROR(toString(p), toString(s), "Type is not enabled."); - return *type; + return globalLegacyTypeDispatch().getNonVariableType(p, s); } Type & getVariableType(Backend p, ScalarType s) { - auto& baseType = getNonVariableType(p, s); - return detail::getVariableHooks().getVariableTypeFromBaseType(baseType); - } - Type & getMaybeVariableType(Backend p, ScalarType s, bool is_variable) { - if (is_variable) { - return getVariableType(p, s); - } else { - return getNonVariableType(p, s); - } + return globalLegacyTypeDispatch().getVariableType(p, s); + } + Type & getType(Backend p, ScalarType s, bool is_variable) { + return globalLegacyTypeDispatch().getType(p, s, is_variable); } + // The passed in Type must be delete'able + // TODO: Just make it take a unique_ptr void registerType(Backend b, ScalarType s, Type* t) { - type_registry[static_cast(b)][static_cast(s)].reset(t); - detail::getVariableHooks().registerVariableTypeFor(this, b, s); + globalLegacyTypeDispatch().registerType(b, s, + LegacyTypeDispatch::TypeUniquePtr{t, LegacyTypeDeleter([](Type* p) { delete p; }) }); } Generator & defaultGenerator(DeviceType device_type) { @@ -127,11 +110,6 @@ class AT_API Context { std::unique_ptr generator_registry[static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)]; private: - // NB: type_registry has nullptr for all CUDA backends until - // CUDA initialization has occurred - std::unique_ptr type_registry - [static_cast(Backend::NumOptions)] - [static_cast(ScalarType::NumOptions)]; void initCUDAIfNeeded(DeviceType p) { if (p == DeviceType::CUDA) { lazyInitCUDA(); @@ -150,8 +128,6 @@ class AT_API Context { std::atomic next_id; std::unique_ptr thc_state; friend struct Type; - friend void register_cpu_types(Context * context); - friend void register_cuda_types(Context * context); }; AT_API Context & globalContext(); @@ -174,8 +150,8 @@ static inline Type& getNonVariableType(DeviceType p, ScalarType s) { return globalContext().getNonVariableType(deviceTypeToBackend(p), s); } -AT_API Type& getMaybeVariableType(TensorOptions options); -AT_API Type& getMaybeVariableType(const TensorImpl*); +AT_API Type& getType(TensorOptions options); +AT_API Type& getType(const TensorImpl*); AT_API Allocator* getCPUAllocator(); diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 966d57bf101d0..9d67537ccdedd 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -65,6 +65,7 @@ name: _fill_ return: self cname: fill + variants: function options: - arguments: - THTensor* self @@ -204,6 +205,7 @@ [[ name: _indexCopy_ cname: indexCopy + variants: function return: argument 0 arguments: - THTensor* self @@ -831,9 +833,7 @@ ]] [[ name: _th_min - variants: - - method - - function + variants: function options: - cname: min return: argument 0,1 @@ -870,9 +870,7 @@ ]] [[ name: _th_max - variants: - - method - - function + variants: function options: - cname: max return: argument 0,1 @@ -892,9 +890,7 @@ name: _th_kthvalue backends: - CPU - 
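// A standalone model (not the real ATen classes) of the ownership pattern used
// by registerType above: a raw pointer is handed to a process-wide dispatch
// table and wrapped in a unique_ptr whose deleter is a plain function pointer,
// so the table needs no knowledge of how the object was allocated. The names
// Widget, WidgetDeleter, Dispatch and globalDispatch are illustrative only.
#include <iostream>
#include <memory>

struct Widget {                 // stands in for at::Type
  virtual ~Widget() = default;
  virtual const char* name() const { return "base"; }
};

struct WidgetDeleter {          // mirrors LegacyTypeDeleter: an optional function-pointer deleter
  using Fun = void(Widget*);
  Fun* fn_ = nullptr;
  WidgetDeleter() = default;
  /* implicit */ WidgetDeleter(Fun* fn) : fn_(fn) {}
  void operator()(Widget* p) { if (fn_) (*fn_)(p); }
};

struct Dispatch {               // stands in for LegacyTypeDispatch
  using Ptr = std::unique_ptr<Widget, WidgetDeleter>;
  Ptr slot;
  void registerWidget(Widget* w) {
    // a captureless lambda decays to a function pointer, as in the diff above
    slot = Ptr{w, WidgetDeleter([](Widget* p) { delete p; })};
  }
  Widget* get() { return slot.get(); }
};

Dispatch& globalDispatch() { static Dispatch d; return d; }

int main() {
  globalDispatch().registerWidget(new Widget());
  std::cout << globalDispatch().get()->name() << "\n";  // prints "base"
}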
variants: - - method - - function + variants: function cname: kthvalue return: argument 0,1 scalar_check: self_->dim() == 0 || (keepdim == false && self_->dim() == 1) @@ -913,9 +909,7 @@ ]] [[ name: _th_mode - variants: - - method - - function + variants: function cname: mode return: argument 0,1 scalar_check: self_->dim() == 0|| (keepdim == false && self_->dim() == 1) @@ -945,9 +939,7 @@ ]] [[ name: _th_median - variants: - - method - - function + variants: function cname: median return: argument 0,1 options: @@ -1023,9 +1015,7 @@ name: _th_all types: - Byte - variants: - - method - - function + variants: function backends: - CPU - CUDA @@ -1062,9 +1052,7 @@ name: _th_any types: - Byte - variants: - - method - - function + variants: function backends: - CPU - CUDA @@ -1104,9 +1092,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1120,9 +1106,7 @@ backends: - CUDA cname: sigmoid - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1136,9 +1120,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1153,9 +1135,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1169,9 +1149,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1186,9 +1164,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1291,9 +1267,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1307,9 +1281,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1323,9 +1295,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1339,9 +1309,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1356,9 +1324,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1372,9 +1338,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1388,9 +1352,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1405,9 +1367,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1421,9 +1381,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1437,9 +1395,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1453,9 +1409,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* 
result @@ -1469,9 +1423,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1485,9 +1437,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1530,9 +1480,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1546,9 +1494,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1562,9 +1508,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1578,9 +1522,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1594,9 +1536,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1610,9 +1550,7 @@ - floating_point backends: - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -1655,9 +1593,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function options: - cname: varall return: accreal @@ -1690,9 +1626,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function options: - cname: stdall return: accreal @@ -1743,9 +1677,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function options: - cname: norm return: argument 0 @@ -2080,9 +2012,7 @@ ]] [[ name: _sumall - variants: - - method - - function + variants: function options: - cname: sumall return: accreal @@ -2091,9 +2021,7 @@ ]] [[ name: _th_sum - variants: - - method - - function + variants: function options: - cname: sum return: argument 0 @@ -2109,9 +2037,7 @@ ]] [[ name: _prodall - variants: - - method - - function + variants: function options: - cname: prodall return: accreal @@ -2120,9 +2046,7 @@ ]] [[ name: _th_prod - variants: - - method - - function + variants: function options: - cname: prod return: argument 0 @@ -2139,9 +2063,7 @@ [[ name: _cumsum cname: cumsum - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2153,9 +2075,7 @@ [[ name: _cumprod cname: cumprod - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2269,9 +2189,7 @@ [[ name: _th_clamp cname: clamp - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2283,9 +2201,7 @@ [[ name: _th_clamp_min cname: cmaxValue - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2296,9 +2212,7 @@ [[ name: _th_clamp_max cname: cminValue - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2310,9 +2224,7 @@ name: _dot backend_type_pairs: [[CUDA,floating_point], [CPU,all]] cname: dot - variants: - - method - - function + variants: function return: accreal arguments: - arg: THTensor* self @@ -2439,9 +2351,7 @@ [[ name: _addmv cname: addmv - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ 
-2460,6 +2370,7 @@ [[ name: _addmv_ cname: addmv + variants: function return: self arguments: - THTensor* self @@ -2476,9 +2387,7 @@ [[ name: _addr cname: addr - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2498,6 +2407,7 @@ name: _addr_ cname: addr return: self + variants: function arguments: - THTensor* self - arg: real beta @@ -2513,9 +2423,7 @@ [[ name: _ger cname: addr - variants: - - method - - function + variants: function return: argument 0 scalar_check: False arguments: @@ -2532,9 +2440,7 @@ [[ name: _mv cname: addmv - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* result @@ -2549,9 +2455,7 @@ ]] [[ name: _mm - variants: - - method - - function + variants: function return: argument 0 options: - cname: addmm @@ -2738,9 +2642,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0,1 arguments: - arg: THTensor* solution @@ -2856,7 +2758,7 @@ ]] [[ name: svd - cname: gesvd + cname: gesdd types: - Float - Double @@ -2889,9 +2791,7 @@ backends: - CPU - CUDA - variants: - - method - - function + variants: function return: argument 0 arguments: - arg: THTensor* output @@ -3342,6 +3242,7 @@ - CUDA cname: bernoulli return: self + variants: function arguments: - THTensor* self - arg: THGenerator* generator @@ -3355,9 +3256,7 @@ - Float - Double return: argument 0 - variants: - - method - - function + variants: function cname: bernoulli_Tensor arguments: - arg: THTensor* output @@ -3435,6 +3334,7 @@ name: _copy_ignoring_overlaps_ cname: copyIgnoringOverlaps return: self + variants: function backends: - CUDA arguments: diff --git a/aten/src/ATen/Formatting.cpp b/aten/src/ATen/Formatting.cpp index dcdf7653f2308..390230316bd0d 100644 --- a/aten/src/ATen/Formatting.cpp +++ b/aten/src/ATen/Formatting.cpp @@ -1,11 +1,13 @@ #include "ATen/Formatting.h" -#include "ATen/Tensor.h" -#include "ATen/TensorMethods.h" + +#include #include -#include +#include #include - +#include +#include +#include namespace at { @@ -58,7 +60,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten for(int64_t i = 0; i < size; i++) { auto z = self_p[i]; if(std::isfinite(z)) { - if(z != ceil(z)) { + if(z != std::ceil(z)) { intMode = false; break; } @@ -91,12 +93,12 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } } if(expMin != 0) { - expMin = floor(log10(expMin)) + 1; + expMin = std::floor(std::log10(expMin)) + 1; } else { expMin = 1; } if(expMax != 0) { - expMax = floor(log10(expMax)) + 1; + expMax = std::floor(std::log10(expMax)) + 1; } else { expMax = 1; } @@ -114,14 +116,14 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } else { if(expMax-expMin > 4) { sz = 11; - if(fabs(expMax) > 99 || fabs(expMin) > 99) { + if(std::fabs(expMax) > 99 || std::fabs(expMin) > 99) { sz = sz + 1; } stream << std::scientific << std::setprecision(4); } else { if(expMax > 5 || expMax < 0) { sz = 7; - scale = pow(10, expMax-1); + scale = std::pow(10, expMax-1); stream << std::fixed << std::setprecision(4); } else { if(expMax == 0) { diff --git a/aten/src/ATen/OptionsGuard.h b/aten/src/ATen/OptionsGuard.h index 9a4ced8ff2c11..2f08d1c958b01 100644 --- a/aten/src/ATen/OptionsGuard.h +++ b/aten/src/ATen/OptionsGuard.h @@ -1,54 +1,2 @@ #pragma once - -#include -#include -#include -#include -#include - -namespace at { - -/// A wrapper over a thread local TensorOptions instance. 
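// A self-contained reduction of the __printFormat heuristic touched in the
// Formatting.cpp hunk above, using the std::-qualified <cmath> calls the patch
// switches to. chooseFormat and FormatChoice are illustrative names, not ATen
// API; the thresholds mirror the surrounding code.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct FormatChoice { bool scientific; double scale; int width; };

FormatChoice chooseFormat(const std::vector<double>& data) {
  bool intMode = true;
  double absMin = HUGE_VAL, absMax = 0;
  for (double z : data) {
    if (!std::isfinite(z)) continue;
    if (z != std::ceil(z)) intMode = false;              // any fractional part disables integer mode
    double a = std::fabs(z);
    if (a != 0) { absMin = std::min(absMin, a); absMax = std::max(absMax, a); }
  }
  // decimal exponents of the smallest/largest magnitudes, as in the diff:
  // expMin = std::floor(std::log10(expMin)) + 1;
  int expMin = (absMin != HUGE_VAL && absMin != 0) ? (int)std::floor(std::log10(absMin)) + 1 : 1;
  int expMax = (absMax != 0) ? (int)std::floor(std::log10(absMax)) + 1 : 1;
  if (intMode) return {false, 1, expMax + 1};
  if (expMax - expMin > 4) return {true, 1, 11};         // wide dynamic range -> scientific
  if (expMax > 5 || expMax < 0) return {false, std::pow(10, expMax - 1), 7};  // fixed, but rescaled
  return {false, 1, expMax + 6};                         // plain fixed notation
}

int main() {
  FormatChoice c = chooseFormat({0.0001, 123456.0, 3.14});
  std::printf("scientific=%d scale=%g width=%d\n", c.scientific, c.scale, c.width);
}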
-struct DefaultTensorOptions { - /// Returns the current thread local default options. - /// Defined in OptionsGuard.cpp because we can't use optional in headers, due - /// to Windows and other compilers. - AT_API static TensorOptions& get(); - - private: - /// This is an optional because of compiler bugs that mis-initialize static - /// thread local variables. The workaround is lazy initialization, i.e. - /// `DefaultTensorOptions::get()` will initialize the `options_` to a proper - /// value upon first invocation. - /// https://gcc.gnu.org/ml/gcc-bugs/2013-12/msg00026.html - static thread_local at::optional options_; -}; - -/// RAII guard that stores the current default options upon construction, sets -/// the current default options to the ones given to its constructor, and -/// finally resets the options back to the original ones in the destructor. -struct OptionsGuard { - /// Stores the current default options and sets them to the given ones. - explicit OptionsGuard(const TensorOptions& options) - : original_(DefaultTensorOptions::get()) { - DefaultTensorOptions::get() = options; - } - - /// Restores the original default options. - ~OptionsGuard() { - DefaultTensorOptions::get() = original_; - } - - /// Returns the original options that were in place at the time of - /// construction of this object. - const TensorOptions& original() { - return original_; - } - - private: - /// The original options that were in place at the time of construction of - /// this object. - TensorOptions original_; -}; - -} // namespace at +#include diff --git a/aten/src/ATen/Registry.h b/aten/src/ATen/Registry.h index 8f3caffe49154..9d8d8ff2ee840 100644 --- a/aten/src/ATen/Registry.h +++ b/aten/src/ATen/Registry.h @@ -1,216 +1,2 @@ #pragma once - -/** - * Simple registry implementation that uses static variables to - * register object creators during program initialization time. - */ - -// NB: This Registry works poorly when you have other namespaces. -// Make all macro invocations from inside the at namespace. - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace at { - -template -inline void PrintOffendingKey(const KeyType& /*key*/) { - printf("[key type printing not supported]\n"); -} - -template <> -inline void PrintOffendingKey(const std::string& key) { - printf("Offending key: %s.\n", key.c_str()); -} - -/** - * @brief A template class that allows one to register classes by keys. - * - * The keys are usually a std::string specifying the name, but can be anything that - * can be used in a std::map. - * - * You should most likely not use the Registry class explicitly, but use the - * helper macros below to declare specific registries as well as registering - * objects. - */ -template -class AT_API Registry { - public: - typedef std::function Creator; - - Registry() : registry_() {} - - void Register(const SrcType& key, Creator creator) { - // The if statement below is essentially the same as the following line: - // CHECK_EQ(registry_.count(key), 0) << "Key " << key - // << " registered twice."; - // However, CHECK_EQ depends on google logging, and since registration is - // carried out at static initialization time, we do not want to have an - // explicit dependency on glog's initialization function. 
- std::lock_guard lock(register_mutex_); - if (registry_.count(key) != 0) { - printf("Key already registered.\n"); - PrintOffendingKey(key); - std::exit(1); - } - registry_[key] = creator; - } - - void Register(const SrcType& key, Creator creator, const std::string& help_msg) { - Register(key, creator); - help_message_[key] = help_msg; - } - - inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } - - ObjectPtrType Create(const SrcType& key, Args... args) { - if (registry_.count(key) == 0) { - // Returns nullptr if the key is not registered. - return nullptr; - } - return registry_[key](args...); - } - - /** - * Returns the keys currently registered as a std::vector. - */ - std::vector Keys() { - std::vector keys; - for (const auto& it : registry_) { - keys.push_back(it.first); - } - return keys; - } - - const std::unordered_map& HelpMessage() const { - return help_message_; - } - - const char* HelpMessage(const SrcType& key) const { - auto it = help_message_.find(key); - if (it == help_message_.end()) { - return nullptr; - } - return it->second.c_str(); - } - - private: - std::unordered_map registry_; - std::unordered_map help_message_; - std::mutex register_mutex_; - - Registry(const Registry&) = delete; - Registry& operator=(const Registry&) = delete; -}; - -template -class AT_API Registerer { - public: - Registerer( - const SrcType& key, - Registry* registry, - typename Registry::Creator creator, - const std::string& help_msg = "") { - registry->Register(key, creator, help_msg); - } - - template - static ObjectPtrType DefaultCreator(Args... args) { - // TODO(jiayq): old versions of NVCC does not handle make_unique well - // so we are forced to use a unique_ptr constructor here. Check if it is - // fine to use make_unique in the future. - // return make_unique(args...); - return ObjectPtrType(new DerivedType(args...)); - } -}; - -/** - * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with - * str and ending with a number that varies with the line. - * Pretty much a copy from 'folly/Preprocessor.h' - */ -#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 -#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) -#ifdef __COUNTER__ -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) -#else -#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) -#endif - -/** - * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function - * declaration, as well as creating a convenient typename for its corresponding - * registerer. - */ -#define AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - AT_API Registry, __VA_ARGS__>* RegistryName(); \ - typedef Registerer, __VA_ARGS__> \ - Registerer##RegistryName; \ - extern template class Registerer, __VA_ARGS__>; - -#define AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, SrcType, ObjectType, PtrType, ...) \ - Registry, __VA_ARGS__>* RegistryName() { \ - static Registry, __VA_ARGS__>* registry = \ - new Registry, __VA_ARGS__>(); \ - return registry; \ - } \ - template class Registerer, __VA_ARGS__>; - -// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated -// creator with comma in its templated arguments. -#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, RegistryName(), __VA_ARGS__); \ - } - -#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) 
\ - namespace { \ - Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ - key, \ - RegistryName(), \ - Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ - ::at::demangle_type<__VA_ARGS__>()); \ - } - -// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) - -#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DECLARE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ - AT_DEFINE_TYPED_REGISTRY( \ - RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) - -// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string -// as the key -// type, because that is the most commonly used cases. -#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) - -#define AT_REGISTER_CLASS(RegistryName, key, ...) \ - AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) - -} // namespace at +#include diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 74d7b6d7b00f0..3f13d59b4467e 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -29,7 +29,7 @@ namespace { // This means that we allocate a [1,0] size indices tensor and a [0] size // values tensor for such an empty tensor. 
SparseTensorImpl::SparseTensorImpl(at::TensorTypeId type_id, at::ScalarType scalar_type) - : TensorImpl(type_id, scalar_type, false) + : TensorImpl(type_id, scalar_type, nullptr, false) , size_{0} , sparseDims_(1) , denseDims_(0) diff --git a/aten/src/ATen/Tensor.cpp b/aten/src/ATen/Tensor.cpp index 05e98069d3e59..860a5d2ab0afe 100644 --- a/aten/src/ATen/Tensor.cpp +++ b/aten/src/ATen/Tensor.cpp @@ -1,4 +1,6 @@ -#include +#include +#include +#include #include diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp index a520f4a5457d6..f4ecaf0b6253f 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/TensorImpl.cpp @@ -1,20 +1,14 @@ #include -#include "ATen/Context.h" -#include #include -#include #include #include +#include -#include +#include namespace at { -Type& TensorImpl::type() const { - return at::getMaybeVariableType(this); -} - Tensor& TensorImpl::grad() { AT_ERROR("grad is not implemented for Tensor"); } @@ -23,35 +17,12 @@ const Tensor& TensorImpl::grad() const { AT_ERROR("grad is not implemented for Tensor"); } -Tensor TensorImpl::detach() const { - AT_ERROR("detach is not implemented for Tensor"); -} - -void TensorImpl::backward( - at::optional gradient, - bool keep_graph, - bool create_graph) { - AT_ERROR("backward is not implemented for Tensor"); -} - -void TensorImpl::set_data(Tensor new_data) { - AT_ERROR("set_type is not implemented for Tensor"); -} - -void Tensor::backward( - at::optional gradient, - bool keep_graph, - bool create_graph) { - tensor_impl_->backward(std::move(gradient), keep_graph, create_graph); -} - -TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, bool is_variable) +TensorImpl::TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable) : TensorImpl({}, type_id, scalar_type, is_variable) { // UndefinedTensors and SparseTensors don't have storages. if (type_id != UndefinedTensorId() && scalar_type != ScalarType::Undefined && type_id != SparseCPUTensorId() && type_id != SparseCUDATensorId()) { - auto type = &globalContext().getNonVariableType(tensorTypeIdToBackend(type_id), scalar_type); - storage_ = type->storage(true); + storage_ = Storage(scalar_type, 0, allocator, true); } } diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index f41eb165279c0..e0a649a49b6cc 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -7,6 +7,8 @@ #include "ATen/core/optional.h" #include "ATen/core/TensorTypeId.h" #include "ATen/core/TensorTypeIdRegistration.h" +#include "ATen/core/LegacyTypeDispatch.h" +#include "ATen/core/Backend.h" struct THTensor; @@ -20,15 +22,18 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public c10::intrusive_ptr_target { TensorImpl() = delete; - TensorImpl(TensorTypeId type_id, ScalarType scalar_type, bool is_variable); + TensorImpl(TensorTypeId type_id, ScalarType scalar_type, Allocator *allocator, bool is_variable); TensorImpl(Storage&& storage, TensorTypeId type_id, bool is_variable); virtual void release_resources() override; - // The implementation of this method will have to be hoisted out and - // hooked in, so that Caffe2 doesn't need to know about Context - // TODO: This really really needs to be inlined. - Type & type() const; + Type & type() const { + // NB: It's valid to use getTypeRaw here, because the TensorImpl + // could not have been created without initializing the Type first. + // TODO: This is not actually true via the Caffe2 codepath! Make + // it so. 
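// A standalone model (illustrative names only, not the real ATen classes) of
// the constructor change above: the implementation object now receives its
// allocator directly and builds a zero-sized storage itself, instead of
// reaching into the global context for a Type and asking the Type for storage.
#include <cstddef>
#include <cstdlib>
#include <iostream>

struct Alloc {
  virtual ~Alloc() = default;
  virtual void* allocate(std::size_t n) const { return std::malloc(n); }
  virtual void deallocate(void* p) const { std::free(p); }
};

struct Storage {
  Storage(std::size_t n, const Alloc* a)
      : alloc_(a), data_(n ? a->allocate(n) : nullptr), size_(n) {}
  ~Storage() { if (data_) alloc_->deallocate(data_); }
  const Alloc* alloc_;   // may be nullptr for storage-less tensors (cf. the SparseTensorImpl hunk)
  void* data_;
  std::size_t size_;
};

struct Impl {
  // Before: look up a Type in a global registry and call type->storage(...).
  // After (as in the diff): the allocator is injected and used directly.
  explicit Impl(const Alloc* allocator) : storage_(0, allocator) {}
  Storage storage_;
};

int main() {
  Alloc cpu_alloc;                       // plays the role of getCPUAllocator()
  Impl t(&cpu_alloc);
  std::cout << "storage size: " << t.storage_.size_ << "\n";
}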
+ return *globalLegacyTypeDispatch().getTypeRaw(tensorTypeIdToBackend(type_id()), scalar_type(), is_variable()); + } TensorTypeId type_id() const { return type_id_; } virtual IntList sizes() const; @@ -85,18 +90,6 @@ struct AT_API TensorImpl : public c10::intrusive_ptr_target { virtual Tensor& grad(); virtual const Tensor& grad() const; - virtual Tensor detach() const; - virtual void detach_() { - AT_ERROR("detach_ is not implemented for Tensor"); - } - - virtual void backward( - at::optional gradient, - bool keep_graph, - bool create_graph); - - virtual void set_data(Tensor new_data); - // TODO: make these protected // Note: storage->size() may be greater than the recorded size // of a tensor diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index f50a4e71da9ca..956c70b9f178a 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -5,7 +5,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? UndefinedTensor::UndefinedTensor() -: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, /* is variable */ false) { +: TensorImpl(UndefinedTensorId(), ScalarType::Undefined, nullptr, /* is variable */ false) { } IntList UndefinedTensor::sizes() const { diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h index d381d1acc5d81..c306fcd6b92b7 100644 --- a/aten/src/ATen/core/Half.h +++ b/aten/src/ATen/core/Half.h @@ -10,6 +10,7 @@ /// intrinsics directly on the Half type from device code. #include +#include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include #ifdef __CUDACC__ #include @@ -74,18 +76,66 @@ struct alignas(4) ComplexHalf { Half real_; Half imag_; ComplexHalf() = default; + Half real() const { return real_; } + Half imag() const { return imag_; } + inline ComplexHalf(std::complex value) + : real_(value.real()), imag_(value.imag()) {} + inline operator std::complex() const { + return {real_, imag_}; + } +}; + +template +struct is_complex_t : public std::false_type {}; + +template +struct is_complex_t> : public std::true_type {}; + +template <> +struct is_complex_t : public std::true_type {}; + +// Extract double from std::complex; is identity otherwise +// TODO: Write in more idiomatic C++17 +template struct scalar_value_type { using type = T; }; +template struct scalar_value_type> { using type = T; }; +template <> struct scalar_value_type { using type = Half; }; + +// The old implementation of Converter as a function made nvcc's head explode +// when we added std::complex on top of the specializations for CUDA-only types +// like __half, so I rewrote it as a templated class (so, no more overloads, +// just (partial) specialization). + +template +struct Converter { + To operator()(From f) { + return static_cast(f); + } }; template -To convert(From f) { - return static_cast(f); +To convert(From from) { + return Converter()(from); } +template +struct Converter< + To, std::complex, + typename std::enable_if< + c10::guts::negation< + is_complex_t + >::value + >::type +> { + To operator()(std::complex f) { + return static_cast(f.real()); + } +}; + // skip isnan and isinf check for integral types template typename std::enable_if::value, bool>::type overflows( From f) { - using limit = std::numeric_limits; + using limit = std::numeric_limits::type>; if (!limit::is_signed && std::numeric_limits::is_signed) { // allow for negative numbers to wrap using two's complement arithmetic. 
// For example, with uint8, this allows for `a - b` to be treated as @@ -97,9 +147,9 @@ typename std::enable_if::value, bool>::type overflows( } template -typename std::enable_if::value, bool>::type overflows( +typename std::enable_if::value, bool>::type overflows( From f) { - using limit = std::numeric_limits; + using limit = std::numeric_limits::type>; if (limit::has_infinity && std::isinf(static_cast(f))) { return false; } @@ -109,6 +159,23 @@ typename std::enable_if::value, bool>::type overflows( return f < limit::lowest() || f > limit::max(); } + +template +typename std::enable_if::value, bool>::type overflows( + From f) { + // casts from complex to real are considered to overflow if the + // imaginary component is non-zero + if (!is_complex_t::value && f.imag() != 0) { + return true; + } + // Check for overflow componentwise + // (Technically, the imag overflow check is guaranteed to be false + // when !is_complex_t, but any optimizer worth its salt will be + // able to figure it out.) + return overflows::type, typename From::value_type>(f.real()) || + overflows::type, typename From::value_type>(f.imag()); +} + template To checked_convert(From f, const char* name) { if (overflows(f)) { diff --git a/aten/src/ATen/core/LegacyTypeDispatch.cpp b/aten/src/ATen/core/LegacyTypeDispatch.cpp new file mode 100644 index 0000000000000..6835399bfe2ca --- /dev/null +++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp @@ -0,0 +1,27 @@ +#include + +namespace at { + +// TODO: This could be bad juju if someone calls globalContext() in the +// destructor of an object with static lifetime. +LegacyTypeDispatch & globalLegacyTypeDispatch() { + static LegacyTypeDispatch singleton; + return singleton; +} + +AT_DEFINE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs) + +const LegacyTypeInitInterface& getLegacyTypeInit() { + static std::unique_ptr legacy_type_init; + static std::once_flag once; + std::call_once(once, [] { + legacy_type_init = LegacyTypeInitRegistry()->Create("LegacyTypeInit", LegacyTypeInitArgs{}); + if (!legacy_type_init) { + legacy_type_init = + std::unique_ptr(new LegacyTypeInitInterface()); + } + }); + return *legacy_type_init; +} + +} diff --git a/aten/src/ATen/core/LegacyTypeDispatch.h b/aten/src/ATen/core/LegacyTypeDispatch.h new file mode 100644 index 0000000000000..578e02e739d0d --- /dev/null +++ b/aten/src/ATen/core/LegacyTypeDispatch.h @@ -0,0 +1,155 @@ +#pragma once + +// The legacy mechanism for dispatching operators in ATen is a Type +// object, which is essentially a giant virtual dispatch table +// for every operation we support dynamically dispatching over. +// +// We intend to deprecate this design for a more extensible one +// that permits addition of extra operators *out-of-band*. However, +// for the time being, it's the only mechanism which works for +// dispatching PyTorch operators, so we are supporting it for now. +// +// The use of Type in ATen/core poses another problem: on a +// mobile build, we don't want to assume that Type is available. +// But all methods on Tensor which route to PyTorch operators +// need to somehow *get* a Type, and then do a virtual call on it. +// How are we going to get the Type? Why, by another indirection! +// +// This registry is the mechanism for getting a concrete Type. +// For a regular build, we register all types here; for a +// mobile build, there are no registrations and instead we +// return a stub which errors for all functions. 
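// A self-contained illustration of the Converter pattern added to Half.h above.
// Function templates cannot be partially specialized, so convert() is routed
// through a class template, and that class is partially specialized for
// std::complex sources (the real part is kept when the destination is real).
// Half and the CUDA-only types are deliberately omitted here.
#include <complex>
#include <iostream>
#include <type_traits>

template <typename T> struct is_complex_t : std::false_type {};
template <typename T> struct is_complex_t<std::complex<T>> : std::true_type {};

template <typename To, typename From, typename Enable = void>
struct Converter {
  To operator()(From f) { return static_cast<To>(f); }
};

// Complex -> real: drop the imaginary part, as in the diff.
template <typename To, typename FromV>
struct Converter<To, std::complex<FromV>,
                 typename std::enable_if<!is_complex_t<To>::value>::type> {
  To operator()(std::complex<FromV> f) { return static_cast<To>(f.real()); }
};

template <typename To, typename From>
To convert(From from) { return Converter<To, From>()(from); }

int main() {
  std::cout << convert<double>(std::complex<float>(2.5f, -1.0f)) << "\n";              // 2.5
  std::cout << convert<std::complex<double>>(std::complex<float>(1.f, 2.f)) << "\n";   // (1,2)
  std::cout << convert<int>(3.9) << "\n";                                              // 3
}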
+// +// NB: We don't use Registry for this, because we don't want to +// pay for a hash table lookup every time we do an operation. + +#include +#include +#include +#include + +namespace at { + +struct AT_CORE_API LegacyTypeInitInterface { + virtual ~LegacyTypeInitInterface() {} + virtual void initCPU() const { + AT_ERROR("cannot use CPU without ATen library"); + } + virtual void initCUDA() const { + AT_ERROR("cannot use CUDA without ATen CUDA library"); + } + virtual void initComplex() const { + AT_ERROR("cannot use complex without ATen Complex library"); + } +}; +struct AT_CORE_API LegacyTypeInitArgs {}; +AT_DECLARE_REGISTRY(LegacyTypeInitRegistry, LegacyTypeInitInterface, LegacyTypeInitArgs); +#define REGISTER_LEGACY_TYPE_INIT(clsname) AT_REGISTER_CLASS(LegacyTypeInitRegistry, clsname, clsname) + +AT_CORE_API const LegacyTypeInitInterface& getLegacyTypeInit(); + +struct Type; + +struct AT_CORE_API LegacyTypeDeleter { + using TypeDeleterFun = void(Type*); + TypeDeleterFun *fn_ = nullptr; + LegacyTypeDeleter() {} + /* implicit */ LegacyTypeDeleter(TypeDeleterFun *fn) : fn_(fn) {} + void operator()(Type * ptr) { + if (fn_) { + (*fn_)(ptr); + } + } +}; + +class AT_CORE_API LegacyTypeDispatch { +public: + using TypeUniquePtr = std::unique_ptr; + // WARNING: This function has the precondition that you have + // initialized the type you want to call. This initialization + // step is generally done by Context, or assumed because you + // have a Tensor and thus the Type of that Tensor must already + // be initialized. + Type* getNonVariableTypeRaw(Backend p, ScalarType s) { + return type_registry[static_cast(p)][static_cast(s)].get(); + } + Type * getNonVariableTypeOpt(Backend p, ScalarType s) { + if (p != Backend::Undefined) { + initForDeviceType(backendToDeviceType(p)); + initForScalarType(s); + } + auto type = getNonVariableTypeRaw(p, s); + + if(!type) { + // there is only a single Undefined Type. 
+ if (p == Backend::Undefined || s == ScalarType::Undefined) { + return getNonVariableTypeRaw(Backend::Undefined, ScalarType::Undefined); + } + } + + return type; + } + + Type & getNonVariableType(Backend p, ScalarType s) { + auto* type = getNonVariableTypeOpt(p, s); + if (!type) AT_ERROR(toString(p), toString(s), "Type is not enabled."); + return *type; + } + + Type* getTypeRaw(Backend p, ScalarType s, bool is_variable) { + auto baseType = getNonVariableTypeRaw(p, s); + if (is_variable) { + return &detail::getVariableHooks().getVariableTypeFromBaseType(*baseType); + } else { + return baseType; + } + } + Type & getVariableType(Backend p, ScalarType s) { + auto& baseType = getNonVariableType(p, s); + return detail::getVariableHooks().getVariableTypeFromBaseType(baseType); + } + Type & getType(Backend p, ScalarType s, bool is_variable) { + if (is_variable) { + return getVariableType(p, s); + } else { + return getNonVariableType(p, s); + } + } + void registerType(Backend b, ScalarType s, TypeUniquePtr&& t) { + type_registry[static_cast(b)][static_cast(s)] = std::move(t); + detail::getVariableHooks().registerVariableTypeFor(this, b, s); + } +private: + void initForDeviceType(DeviceType p) { + static std::once_flag cpu_once; + static std::once_flag cuda_once; + if (p == DeviceType::CPU) { + std::call_once(cpu_once, [] { + getLegacyTypeInit().initCPU(); + }); + } else if (p == DeviceType::CUDA) { + std::call_once(cuda_once, [] { + getLegacyTypeInit().initCUDA(); + }); + } + } + void initForScalarType(ScalarType s) { + static std::once_flag once; + // Only complex may need initialization + if (isComplexType(s)) { + std::call_once(once, [] { + getLegacyTypeInit().initComplex(); + }); + } + } + + // NB: type_registry has nullptr for all CUDA backends until + // CUDA initialization has occurred + TypeUniquePtr type_registry + [static_cast(Backend::NumOptions)] + [static_cast(ScalarType::NumOptions)]; +}; + +AT_CORE_API LegacyTypeDispatch & globalLegacyTypeDispatch(); + +} // namespace at diff --git a/aten/src/ATen/core/Macros.h b/aten/src/ATen/core/Macros.h index 1ada6d2b5ffb1..67efa523ac2bb 100644 --- a/aten/src/ATen/core/Macros.h +++ b/aten/src/ATen/core/Macros.h @@ -45,6 +45,21 @@ classname(const classname&) = delete; \ classname& operator=(const classname&) = delete + +#if defined(__ANDROID__) +#define AT_ANDROID 1 +#define AT_MOBILE 1 +#elif (defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define AT_IOS 1 +#define AT_MOBILE 1 +#elif (defined(__APPLE__) && TARGET_OS_MAC) +#define AT_IOS 1 +#define AT_MOBILE 0 +#else +#define AT_MOBILE 0 +#endif // ANDROID / IOS / MACOS + namespace at { inline int stoi(const std::string& str) { #if defined(__ANDROID__) diff --git a/aten/src/ATen/OptionsGuard.cpp b/aten/src/ATen/core/OptionsGuard.cpp similarity index 52% rename from aten/src/ATen/OptionsGuard.cpp rename to aten/src/ATen/core/OptionsGuard.cpp index 6716fd68fef2d..29e1fb9d0e3b7 100644 --- a/aten/src/ATen/OptionsGuard.cpp +++ b/aten/src/ATen/core/OptionsGuard.cpp @@ -1,8 +1,10 @@ -#include +#include #include namespace at { +#if !AT_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + thread_local at::optional DefaultTensorOptions::options_; TensorOptions& DefaultTensorOptions::get() { @@ -13,4 +15,14 @@ TensorOptions& DefaultTensorOptions::get() { return *options_; } +#else + +TensorOptions DefaultTensorOptions::options_(/*use_thread_local_default_options=*/false); + +const TensorOptions& DefaultTensorOptions::get() { + return options_; +} + 
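// A standalone model of the lazy-initialization scheme in LegacyTypeDispatch
// above: a [Backend][ScalarType]-indexed table of unique_ptrs, populated on
// first use through std::call_once so expensive backends (CUDA, complex) are
// only brought up when requested. The enum values and the Type stand-in are
// illustrative; the real table is keyed by at::Backend and at::ScalarType.
#include <iostream>
#include <memory>
#include <mutex>

enum class Backend { CPU, CUDA, NumOptions };
enum class Dtype { Float, Double, NumOptions };

struct Type { Backend b; Dtype d; };

class Dispatch {
 public:
  Type* get(Backend b, Dtype d) {
    initFor(b);  // cheap after the first call for this backend
    return table_[static_cast<int>(b)][static_cast<int>(d)].get();
  }
  void registerType(Backend b, Dtype d, std::unique_ptr<Type> t) {
    table_[static_cast<int>(b)][static_cast<int>(d)] = std::move(t);
  }
 private:
  void initFor(Backend b) {
    // per-backend once_flags, shared by the singleton instance below
    static std::once_flag cpu_once, cuda_once;
    if (b == Backend::CPU) {
      std::call_once(cpu_once, [this] {
        registerType(Backend::CPU, Dtype::Float,
                     std::unique_ptr<Type>(new Type{Backend::CPU, Dtype::Float}));
      });
    } else {
      std::call_once(cuda_once, [] { std::cout << "CUDA backend init would run here\n"; });
    }
  }
  std::unique_ptr<Type> table_[static_cast<int>(Backend::NumOptions)]
                              [static_cast<int>(Dtype::NumOptions)];
};

Dispatch& globalDispatch() { static Dispatch d; return d; }

int main() {
  std::cout << (globalDispatch().get(Backend::CPU, Dtype::Float) != nullptr) << "\n";    // 1: registered lazily
  std::cout << (globalDispatch().get(Backend::CUDA, Dtype::Double) != nullptr) << "\n";  // 0: nothing registered
}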
+#endif + } // namespace at diff --git a/aten/src/ATen/core/OptionsGuard.h b/aten/src/ATen/core/OptionsGuard.h new file mode 100644 index 0000000000000..b359638d53a61 --- /dev/null +++ b/aten/src/ATen/core/OptionsGuard.h @@ -0,0 +1,82 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace at { + +// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +// thread_local is not supported. In that case, we don't provide +// an OptionsGuard; and force you to pass around options manually. +#if !AT_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + +/// A wrapper over a thread local TensorOptions instance. +struct DefaultTensorOptions { + /// Returns the current thread local default options. + /// Defined in OptionsGuard.cpp because we can't use optional in headers, due + /// to Windows and other compilers. + /// TODO: The inability to use optional in headers is no longer true + AT_API static TensorOptions& get(); + + private: + /// This is an optional because of compiler bugs that mis-initialize static + /// thread local variables. The workaround is lazy initialization, i.e. + /// `DefaultTensorOptions::get()` will initialize the `options_` to a proper + /// value upon first invocation. + /// https://gcc.gnu.org/ml/gcc-bugs/2013-12/msg00026.html + static thread_local at::optional options_; +}; + +/// RAII guard that stores the current default options upon construction, sets +/// the current default options to the ones given to its constructor, and +/// finally resets the options back to the original ones in the destructor. +/// +/// You should NOT use OptionsGuard for internal code in ATen; it is reserved +/// for end users. +struct OptionsGuard { + /// Stores the current default options and sets them to the given ones. + explicit OptionsGuard(const TensorOptions& options) + : original_(DefaultTensorOptions::get()) { + DefaultTensorOptions::get() = options; + } + + /// Restores the original default options. + ~OptionsGuard() { + DefaultTensorOptions::get() = original_; + } + + /// Returns the original options that were in place at the time of + /// construction of this object. + const TensorOptions& original() { + return original_; + } + + private: + /// The original options that were in place at the time of construction of + /// this object. + TensorOptions original_; +}; + +#else // AT_MOBILE + +struct DefaultTensorOptions { + AT_API static const TensorOptions& get(); +private: + static TensorOptions options_; +}; + +template +struct OptionsGuard { + OptionsGuard() { + static_assert(!std::is_same::value, + "OptionsGuard is not supported on mobile; please pass around TensorOptions manually"); + } +}; + +#endif + +} // namespace at diff --git a/aten/src/ATen/core/Registry.h b/aten/src/ATen/core/Registry.h new file mode 100644 index 0000000000000..8f3caffe49154 --- /dev/null +++ b/aten/src/ATen/core/Registry.h @@ -0,0 +1,216 @@ +#pragma once + +/** + * Simple registry implementation that uses static variables to + * register object creators during program initialization time. + */ + +// NB: This Registry works poorly when you have other namespaces. +// Make all macro invocations from inside the at namespace. 
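// A standalone reduction of the DefaultTensorOptions/OptionsGuard pair defined
// above: an RAII guard that swaps a thread-local default in its constructor and
// restores the previous value in its destructor. "Options", "Defaults" and
// "Guard" are illustrative stand-ins, not at::TensorOptions or the real guard.
#include <iostream>

struct Options { bool requires_grad; int device; };

struct Defaults {
  static Options& get() {
    // thread-local so a guard on one thread never affects another
    static thread_local Options opts = {false, 0};
    return opts;
  }
};

struct Guard {
  explicit Guard(const Options& o) : original_(Defaults::get()) { Defaults::get() = o; }
  ~Guard() { Defaults::get() = original_; }
  const Options& original() const { return original_; }
 private:
  Options original_;
};

int main() {
  std::cout << Defaults::get().device << "\n";    // 0
  {
    Guard g(Options{true, 1});
    std::cout << Defaults::get().device << "\n";  // 1 while the guard is alive
  }
  std::cout << Defaults::get().device << "\n";    // 0 again after the guard is destroyed
}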
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +template +inline void PrintOffendingKey(const KeyType& /*key*/) { + printf("[key type printing not supported]\n"); +} + +template <> +inline void PrintOffendingKey(const std::string& key) { + printf("Offending key: %s.\n", key.c_str()); +} + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything that + * can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class AT_API Registry { + public: + typedef std::function Creator; + + Registry() : registry_() {} + + void Register(const SrcType& key, Creator creator) { + // The if statement below is essentially the same as the following line: + // CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, CHECK_EQ depends on google logging, and since registration is + // carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. + std::lock_guard lock(register_mutex_); + if (registry_.count(key) != 0) { + printf("Key already registered.\n"); + PrintOffendingKey(key); + std::exit(1); + } + registry_[key] = creator; + } + + void Register(const SrcType& key, Creator creator, const std::string& help_msg) { + Register(key, creator); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { return (registry_.count(key) != 0); } + + ObjectPtrType Create(const SrcType& key, Args... args) { + if (registry_.count(key) == 0) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return registry_[key](args...); + } + + /** + * Returns the keys currently registered as a std::vector. + */ + std::vector Keys() { + std::vector keys; + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + private: + std::unordered_map registry_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + Registry(const Registry&) = delete; + Registry& operator=(const Registry&) = delete; +}; + +template +class AT_API Registerer { + public: + Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + // TODO(jiayq): old versions of NVCC does not handle make_unique well + // so we are forced to use a unique_ptr constructor here. Check if it is + // fine to use make_unique in the future. + // return make_unique(args...); + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * AT_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. 
+ * Pretty much a copy from 'folly/Preprocessor.h' + */ +#define AT_CONCATENATE_IMPL(s1, s2) s1##s2 +#define AT_CONCATENATE(s1, s2) AT_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __COUNTER__) +#else +#define AT_ANONYMOUS_VARIABLE(str) AT_CONCATENATE(str, __LINE__) +#endif + +/** + * AT_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. + */ +#define AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + AT_API Registry, __VA_ARGS__>* RegistryName(); \ + typedef Registerer, __VA_ARGS__> \ + Registerer##RegistryName; \ + extern template class Registerer, __VA_ARGS__>; + +#define AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + Registry, __VA_ARGS__>* RegistryName() { \ + static Registry, __VA_ARGS__>* registry = \ + new Registry, __VA_ARGS__>(); \ + return registry; \ + } \ + template class Registerer, __VA_ARGS__>; + +// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated +// creator with comma in its templated arguments. +#define AT_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ + namespace { \ + Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, RegistryName(), __VA_ARGS__); \ + } + +#define AT_REGISTER_TYPED_CLASS(RegistryName, key, ...) \ + namespace { \ + Registerer##RegistryName AT_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::at::demangle_type<__VA_ARGS__>()); \ + } + +// AT_DECLARE_REGISTRY and AT_DEFINE_REGISTRY are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define AT_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) + +#define AT_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, __VA_ARGS__) + +#define AT_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) + +#define AT_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + AT_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, __VA_ARGS__) + +// AT_REGISTER_CREATOR and AT_REGISTER_CLASS are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define AT_REGISTER_CREATOR(RegistryName, key, ...) \ + AT_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) + +#define AT_REGISTER_CLASS(RegistryName, key, ...) 
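// A usage sketch for the registry declared above, mirroring how Context.cpp
// earlier in this diff registers LegacyTypeInit. WidgetBase, WidgetArgs and
// FancyWidget are made-up illustration names; the include path and the
// availability of at::demangle_type (used inside AT_REGISTER_CLASS) are
// assumptions to be checked against the actual header.
#include <ATen/core/Registry.h>
#include <iostream>
#include <string>

namespace at {  // the header asks that all macro invocations live inside at::

struct WidgetBase {
  virtual ~WidgetBase() {}
  virtual std::string name() const { return "base"; }
};
struct WidgetArgs { int verbosity; };

// Declares and defines (roughly) Registry<std::string, std::unique_ptr<WidgetBase>, WidgetArgs>*
// WidgetRegistry(), plus the RegistererWidgetRegistry helper type.
AT_DECLARE_REGISTRY(WidgetRegistry, WidgetBase, WidgetArgs);
AT_DEFINE_REGISTRY(WidgetRegistry, WidgetBase, WidgetArgs);

struct FancyWidget : WidgetBase {
  explicit FancyWidget(WidgetArgs) {}
  std::string name() const override { return "fancy"; }
};
// Static registration at program start-up, keyed by the stringified class name.
AT_REGISTER_CLASS(WidgetRegistry, FancyWidget, FancyWidget);

}  // namespace at

int main() {
  auto w = at::WidgetRegistry()->Create("FancyWidget", at::WidgetArgs{1});
  std::cout << (w ? w->name() : std::string("not registered")) << "\n";  // "fancy"
}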
\ + AT_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +} // namespace at diff --git a/aten/src/ATen/core/Scalar.cpp b/aten/src/ATen/core/Scalar.cpp index 4916e39c59b93..7bdc770f7a293 100644 --- a/aten/src/ATen/core/Scalar.cpp +++ b/aten/src/ATen/core/Scalar.cpp @@ -3,11 +3,13 @@ namespace at { Scalar Scalar::operator-() const { - if (isFloatingPoint()) { - return Scalar(-v.d); - } else { - return Scalar(-v.i); - } + if (isFloatingPoint()) { + return Scalar(-v.d); + } else if (isComplex()) { + return Scalar(std::complex(-v.z[0], -v.z[1])); + } else { + return Scalar(-v.i); + } } } // namespace at diff --git a/aten/src/ATen/core/Scalar.h b/aten/src/ATen/core/Scalar.h index 0e55688377bc7..35c4b538336ae 100644 --- a/aten/src/ATen/core/Scalar.h +++ b/aten/src/ATen/core/Scalar.h @@ -23,21 +23,39 @@ class AT_API Scalar { : tag(Tag::HAS_##member) { \ v . member = convert(vv); \ } + // We can't set v in the initializer list using the + // syntax v{ .member = ... } because it doesn't work on MSVC AT_FORALL_SCALAR_TYPES(DEFINE_IMPLICIT_CTOR) #undef DEFINE_IMPLICIT_CTOR +#define DEFINE_IMPLICIT_COMPLEX_CTOR(type,name,member) \ + Scalar(type vv) \ + : tag(Tag::HAS_##member) { \ + v . member[0] = convert(vv.real()); \ + v . member[1] = convert(vv.imag()); \ + } + + DEFINE_IMPLICIT_COMPLEX_CTOR(at::ComplexHalf,ComplexHalf,z) + DEFINE_IMPLICIT_COMPLEX_CTOR(std::complex,ComplexFloat,z) + DEFINE_IMPLICIT_COMPLEX_CTOR(std::complex,ComplexDouble,z) + +#undef DEFINE_IMPLICIT_COMPLEX_CTOR + #define DEFINE_ACCESSOR(type,name,member) \ type to##name () const { \ if (Tag::HAS_d == tag) { \ return checked_convert(v.d, #type); \ + } else if (Tag::HAS_z == tag) { \ + return checked_convert>({v.z[0], v.z[1]}, #type); \ } else { \ return checked_convert(v.i, #type); \ } \ } - AT_FORALL_SCALAR_TYPES(DEFINE_ACCESSOR) + // TODO: Support ComplexHalf accessor + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_ACCESSOR) //also support scalar.to(); template @@ -50,15 +68,22 @@ class AT_API Scalar { bool isIntegral() const { return Tag::HAS_i == tag; } + bool isComplex() const { + return Tag::HAS_z == tag; + } Scalar operator-() const; private: - enum class Tag { HAS_d, HAS_i }; + enum class Tag { HAS_d, HAS_i, HAS_z }; Tag tag; union { double d; - int64_t i = 0; + int64_t i; + // Can't do put std::complex in the union, because it triggers + // an nvcc bug: + // error: designator may not specify a non-POD subobject + double z[2]; } v; friend struct Type; }; diff --git a/aten/src/ATen/core/ScalarType.h b/aten/src/ATen/core/ScalarType.h index 7c8f124c513a4..b5e1a47646d7d 100644 --- a/aten/src/ATen/core/ScalarType.h +++ b/aten/src/ATen/core/ScalarType.h @@ -25,6 +25,21 @@ _(at::ComplexHalf,ComplexHalf,z) /* 8 */ \ _(std::complex,ComplexFloat,z) /* 9 */ \ _(std::complex,ComplexDouble,z) /* 10 */ +// If you want to support ComplexHalf for real, replace occurrences +// of this macro with AT_FORALL_SCALAR_TYPES_WITH_COMPLEX. But +// beware: convert() doesn't work for all the conversions you need... 
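// A standalone reduction (not the real at::Scalar) of the Scalar change above:
// the tagged union gains a HAS_z tag and stores the complex value as two raw
// doubles, because putting std::complex directly in the union trips the nvcc
// bug mentioned in the diff. MiniScalar and its members are illustrative names.
#include <complex>
#include <cstdint>
#include <iostream>

class MiniScalar {
 public:
  MiniScalar(double d) : tag(Tag::HAS_d) { v.d = d; }
  MiniScalar(int64_t i) : tag(Tag::HAS_i) { v.i = i; }
  MiniScalar(std::complex<double> z) : tag(Tag::HAS_z) { v.z[0] = z.real(); v.z[1] = z.imag(); }

  bool isFloatingPoint() const { return tag == Tag::HAS_d; }
  bool isIntegral() const { return tag == Tag::HAS_i; }
  bool isComplex() const { return tag == Tag::HAS_z; }

  // Mirrors the Scalar::operator-() hunk: negate whichever member is active.
  MiniScalar operator-() const {
    if (isFloatingPoint()) return MiniScalar(-v.d);
    if (isComplex()) return MiniScalar(std::complex<double>(-v.z[0], -v.z[1]));
    return MiniScalar(-v.i);
  }

  std::complex<double> toComplexDouble() const {
    if (isComplex()) return {v.z[0], v.z[1]};
    if (isFloatingPoint()) return {v.d, 0.0};
    return {static_cast<double>(v.i), 0.0};
  }

 private:
  enum class Tag { HAS_d, HAS_i, HAS_z };
  Tag tag;
  union {
    double d;
    int64_t i;
    double z[2];  // raw storage for the complex case
  } v;
};

int main() {
  MiniScalar s(std::complex<double>(1.0, -2.0));
  std::cout << (-s).toComplexDouble() << "\n";  // (-1,2)
}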
+#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(_) \ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(at::Half,Half,d) \ +_(float,Float,d) \ +_(double,Double,d) \ +_(std::complex,ComplexFloat,z) \ +_(std::complex,ComplexDouble,z) + #define AT_FORALL_SCALAR_TYPES(_) \ _(uint8_t,Byte,i) \ _(int8_t,Char,i) \ diff --git a/aten/src/ATen/coreTensorOptions.cpp b/aten/src/ATen/core/TensorOptions.cpp similarity index 90% rename from aten/src/ATen/coreTensorOptions.cpp rename to aten/src/ATen/core/TensorOptions.cpp index b1f5df07bfd52..ddf7a6bbc236b 100644 --- a/aten/src/ATen/coreTensorOptions.cpp +++ b/aten/src/ATen/core/TensorOptions.cpp @@ -1,9 +1,9 @@ #include -#include +#include #include -#include -#include +#include +#include #include #include diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/core/UniqueVoidPtr.h index 405d286308e08..a7c9d6119bfcd 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/core/UniqueVoidPtr.h @@ -1,3 +1,4 @@ +#pragma once #include #include diff --git a/aten/src/ATen/detail/VariableHooksInterface.cpp b/aten/src/ATen/core/VariableHooksInterface.cpp similarity index 94% rename from aten/src/ATen/detail/VariableHooksInterface.cpp rename to aten/src/ATen/core/VariableHooksInterface.cpp index 8569052cac363..3728114492e53 100644 --- a/aten/src/ATen/detail/VariableHooksInterface.cpp +++ b/aten/src/ATen/core/VariableHooksInterface.cpp @@ -1,4 +1,4 @@ -#include +#include namespace at { diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h similarity index 85% rename from aten/src/ATen/detail/VariableHooksInterface.h rename to aten/src/ATen/core/VariableHooksInterface.h index 8b4a9a90770a2..09c972255ea6f 100644 --- a/aten/src/ATen/detail/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -1,11 +1,12 @@ #pragma once -#include -#include -#include +#include +#include +#include namespace at { - class Context; + class LegacyTypeDispatch; + struct Type; } // NB: Registry class not actually in the namespace detail, due to limitations @@ -29,7 +30,7 @@ struct AT_API VariableHooksInterface { AT_ERROR("cannot getVariableTypeFromBaseType without libtorch"); } - virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { + virtual void registerVariableTypeFor(LegacyTypeDispatch*, Backend backend, ScalarType scalar_type) const { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded } diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h new file mode 100644 index 0000000000000..2ca9a7f685110 --- /dev/null +++ b/aten/src/ATen/core/context_base.h @@ -0,0 +1,186 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace caffe2 { +class Event; +class DeviceOption; + +} // namespace caffe2 +namespace at { + +class BaseContext; + +/* BaseStaticContext defines the interface for static context, which contains + functions that are invoked statically before in Tensor class, e.g. New, + We will merge this with Allocator later. 
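// The X-macro pattern behind AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF
// above: a single list macro enumerates (ctype, Name, union-member) triples, and
// each call site expands the list with its own per-entry macro. The list below is
// a trimmed illustrative subset, not the full ATen scalar-type list.
#include <complex>
#include <cstdint>
#include <iostream>

#define FORALL_DEMO_TYPES(_)              \
  _(uint8_t, Byte, i)                     \
  _(int64_t, Long, i)                     \
  _(double, Double, d)                    \
  _(std::complex<double>, ComplexDouble, z)

// Expansion 1: an enum of scalar types.
#define DEFINE_ENUM_ENTRY(ctype, name, member) name,
enum class DemoScalarType { FORALL_DEMO_TYPES(DEFINE_ENUM_ENTRY) NumOptions };
#undef DEFINE_ENUM_ENTRY

// Expansion 2: a human-readable name table, generated from the same list.
#define DEFINE_NAME_ENTRY(ctype, name, member) #name,
static const char* demo_type_names[] = { FORALL_DEMO_TYPES(DEFINE_NAME_ENTRY) };
#undef DEFINE_NAME_ENTRY

int main() {
  for (const char* n : demo_type_names) std::cout << n << " ";
  std::cout << "\n";  // Byte Long Double ComplexDouble
}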
+ */ +class AT_CORE_API BaseStaticContext { + public: + virtual ~BaseStaticContext() noexcept {} + + virtual std::pair New(size_t nbytes) const = 0; + + virtual std::unique_ptr CreateContext() = 0; + + virtual std::unique_ptr CreateContext( + const caffe2::DeviceOption&) = 0; + + virtual DeviceType GetDeviceType() = 0; + + /* + * @brief: Sets the DeviceOption for argument `device` based on the + * current context and the a data pointer + */ + virtual void ExtractDeviceOption( + caffe2::DeviceOption* device, + const void* /*data*/) = 0; +}; + +/** + * Virtual interface for the Context class in Caffe2. + * + * A Context defines all the necessities to run an operator on a specific + * device. Specific Context classes needs to implement all the pure virtual + * functions in the BaseContext class. + * TODO: add docs after this is finalized. + */ +class AT_CORE_API BaseContext { + public: + virtual ~BaseContext() noexcept {} + + virtual BaseStaticContext* GetStaticContext() const = 0; + + /* Sorry for the naming, will get rid of this in future diff */ + virtual DeviceType GetDevicetype() const = 0; + + virtual void SwitchToDevice(int /*stream_id*/) = 0; + + inline void SwitchToDevice() { + SwitchToDevice(0); + } + + virtual void WaitEvent(const caffe2::Event& ev) = 0; + + virtual void Record(caffe2::Event* ev, const char* err_msg = nullptr) + const = 0; + + virtual void FinishDeviceComputation() = 0; + + // This used to be arbitrary cross-device copy, but it turns out everyone + // did direct CPU-X copy, so we just make three functions for it (to avoid + // double dispatch). This will get obsoleted by C10. where copies + // will be proper operators (and get to rely on multiple dispatch there.) + virtual void CopyBytesSameDevice( + size_t nbytes, + const void* src, + void* dst) = 0; + + virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0; + + virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0; + + virtual void CopyBytesToDevice( + size_t nbytes, + const void* src, + void* dst, + DeviceType type) { + if (type == DeviceType::CPU) { + CopyBytesToCPU(nbytes, src, dst); + } else if (type == GetDevicetype()) { + CopyBytesSameDevice(nbytes, src, dst); + } else { + AT_ERROR( + "CopyBytesToDevice can only copy to CPU or between same " + "device. 
Can't copy from: ", + GetDevicetype(), + " to", + type); + } + } + + template + inline void CopySameDevice(size_t n, const T* src, T* dst) { + static_assert( + std::is_fundamental::value, + "CopySameDevice requires fundamental types"); + CopyBytesSameDevice( + n * sizeof(T), static_cast(src), static_cast(dst)); + } + + template + inline void CopyFromCPU(size_t n, const T* src, T* dst) { + static_assert( + std::is_fundamental::value, + "CopyFromCPU requires fundamental types"); + CopyBytesFromCPU( + n * sizeof(T), static_cast(src), static_cast(dst)); + } + + template + inline void CopyToCPU(size_t n, const T* src, T* dst) { + static_assert( + std::is_fundamental::value, "CopyToCPU requires fundamental types"); + CopyBytesToCPU( + n * sizeof(T), static_cast(src), static_cast(dst)); + } + + virtual bool SupportsNonFundamentalTypes() const { + return false; + } + + inline void EnforceMetaCopyOK() { + AT_ASSERTM( + SupportsNonFundamentalTypes(), "Context requires fundamental types"); + } + + void CopyItemsSameDevice( + const caffe2::TypeMeta& meta, + size_t n, + const void* src, + void* dst) { + if (meta.copy()) { + EnforceMetaCopyOK(); + meta.copy()(src, dst, n); + } else { + CopyBytesSameDevice(n * meta.itemsize(), src, dst); + } + } + + void CopyItemsFromCPU( + const caffe2::TypeMeta& meta, + size_t n, + const void* src, + void* dst) { + if (meta.copy()) { + EnforceMetaCopyOK(); + meta.copy()(src, dst, n); + } else { + CopyBytesFromCPU(n * meta.itemsize(), src, dst); + } + } + + void CopyItemsToCPU( + const caffe2::TypeMeta& meta, + size_t n, + const void* src, + void* dst) { + if (meta.copy()) { + EnforceMetaCopyOK(); + meta.copy()(src, dst, n); + } else { + CopyBytesToCPU(n * meta.itemsize(), src, dst); + } + } +}; + +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/functional.h b/aten/src/ATen/cpu/vec256/functional.h index c5e4efba70624..cc2408d6ed846 100644 --- a/aten/src/ATen/cpu/vec256/functional.h +++ b/aten/src/ATen/cpu/vec256/functional.h @@ -68,8 +68,8 @@ template inline scalar_t map2_reduce_all( const MapOp& map_fun, const ReduceOp& red_fun, - scalar_t* data, - scalar_t* data2, + const scalar_t* data, + const scalar_t* data2, int64_t size) { using Vec = vec256::Vec256; if (size < Vec::size) { diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 80cc617fad526..fa3c4e550d6a3 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -166,6 +166,13 @@ struct Vec256 { Vec256 rsqrt() const { return map([](T x) { return 1 / std::sqrt(x); }); } + Vec256 pow(const Vec256 &exp) const { + Vec256 ret; + for (int64_t i = 0; i < size; i++) { + ret[i] = std::pow(values[i], exp[i]); + } + return ret; + } }; template Vec256 operator+(const Vec256 &a, const Vec256 &b) { @@ -208,6 +215,14 @@ template Vec256 max(const Vec256 &a, const Vec256 &b) { return c; } +template Vec256 min(const Vec256 &a, const Vec256 &b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size; i++) { + c[i] = std::min(a[i], b[i]); + } + return c; +} + template T fmadd(const T& a, const T& b, const T& c) { return a * b + c; diff --git a/aten/src/ATen/cpu/vec256/vec256_double.h b/aten/src/ATen/cpu/vec256/vec256_double.h index de9d441ce2471..05c2163465975 100644 --- a/aten/src/ATen/cpu/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_double.h @@ -151,6 +151,9 @@ template <> class Vec256 { Vec256 rsqrt() const { return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values)); } + Vec256 pow(const Vec256 &b) const { + 
return Vec256(Sleef_powd4_u10(values, b)); + } }; template <> @@ -178,6 +181,11 @@ Vec256 inline max(const Vec256& a, const Vec256& b) { return _mm256_max_pd(a, b); } +template <> +Vec256 inline min(const Vec256& a, const Vec256& b) { + return _mm256_min_pd(a, b); +} + #ifdef __AVX2__ template <> Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { diff --git a/aten/src/ATen/cpu/vec256/vec256_float.h b/aten/src/ATen/cpu/vec256/vec256_float.h index e0f87362a9609..c38fae11c2486 100644 --- a/aten/src/ATen/cpu/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_float.h @@ -156,6 +156,9 @@ template <> class Vec256 { Vec256 rsqrt() const { return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values)); } + Vec256 pow(const Vec256 &b) const { + return Vec256(Sleef_powf8_u10(values, b)); + } }; template <> @@ -183,6 +186,11 @@ Vec256 inline max(const Vec256& a, const Vec256& b) { return _mm256_max_ps(a, b); } +template <> +Vec256 inline min(const Vec256& a, const Vec256& b) { + return _mm256_min_ps(a, b); +} + #ifdef __AVX2__ template <> Vec256 fmadd(const Vec256& a, const Vec256& b, const Vec256& c) { diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 36615c968a62f..88981fcc4d16f 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -338,7 +338,7 @@ bool CUDA_tensor_apply2(at::Tensor a, HANDLE_CASE(TYPE, A, -1); \ break; \ } \ -} +} #define HANDLE_A_CASE(TYPE, A, B) { \ switch (A) { \ @@ -382,7 +382,7 @@ bool CUDA_tensor_apply2(at::Tensor a, /* Only instantiates the all 1D special case and the fallback all nD case for - large (64-bit indexed) tensors to reduce compilation time. + large (64-bit indexed) tensors to reduce compilation time. */ if (aInfo.dims == 1 && bInfo.dims == 1) { kernelPointwiseApply2 -#include - -// Internal implementation is entirely hidden -struct CUDAEventInternals { - std::atomic refcount; - int64_t device; // Note: cudaGetDevice works with int32_t, not int64_t - cudaEvent_t event; -}; - -namespace at { -namespace cuda { - -namespace detail { - -/* -* Pointer-based event API -*/ -CUDAEventInternals* CUDAEvent_create(unsigned int flags) { - std::unique_ptr internals { new CUDAEventInternals() }; - internals->refcount = 1; - internals->device = current_device(); - AT_CUDA_CHECK(cudaEventCreateWithFlags(&internals->event, flags)); - return internals.release(); -} - -void CUDAEvent_retain(CUDAEventInternals* internals) { - internals->refcount++; -} - -void CUDAEvent_uncheckedFree(CUDAEventInternals* internals) { - if (--internals->refcount) { - cudaEventDestroy(internals->event); - } -} -cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals) { - return internals->event; -} - -int64_t CUDAEvent_device(CUDAEventInternals* internals) { - return internals->device; -} - -void CUDAEvent_record(CUDAEventInternals* internals, const CUDAStream& stream) { - AT_CUDA_CHECK(cudaEventRecord(internals->event, stream)); -} - -} // namespace detail - -void CUDAEvent::record() const { - record(getCurrentCUDAStream()); -} - -void CUDAEvent::record(const CUDAStream& stream) const { - detail::CUDAEvent_record(internals_, stream); -} - - -} // namespace cuda -} // namespace at diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index 7a711bf2b2f2d..04aba2703c96a 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -1,78 +1,116 @@ #pragma once -#include -#include +#include "ATen/cuda/ATenCUDAGeneral.h" +#include 
"ATen/cuda/CUDAStream.h" +#include "ATen/cuda/CUDAContext.h" +#include "ATen/cuda/Exceptions.h" +#include "ATen/core/Error.h" +#include "ATen/DeviceGuard.h" #include "cuda_runtime_api.h" -#include -#include +#include +#include + +namespace at { namespace cuda { /* -* A CUDA event interface with no CUDA build dependency. +* CUDAEvents are movable not copyable wrappers around CUDA's events. * -* Includes the CUDAEvent RAII class and a pointer-based event API. +* CUDAEvents are constructed lazily when recorded on streams. The events +* have a device, and this device is acquired from the first recording stream. +* Later streams that record to the event must share this device, but streams +* on any device can wait on the event. */ - -struct CUDAEventInternals; - -namespace at { -namespace cuda { - -struct CUDAStream; - -namespace detail { - -// Pointer-based API (for internal use) -// Note: ATen/Context is preferred to work with streams safely -AT_API CUDAEventInternals* CUDAEvent_create(unsigned int flags); -AT_API void CUDAEvent_retain(CUDAEventInternals* internals); -AT_API void CUDAEvent_uncheckedFree(CUDAEventInternals* internals); -AT_API cudaEvent_t CUDAEvent_event(CUDAEventInternals* internals); -AT_API int64_t CUDAEvent_device(CUDAEventInternals* internals); - -} // namespace detail - -struct CUDAEvent { +struct AT_CUDA_API CUDAEvent { // Constants static constexpr unsigned int DEFAULT_FLAGS = cudaEventDisableTiming; // Constructors - CUDAEvent(unsigned int flags = DEFAULT_FLAGS) - : internals_(detail::CUDAEvent_create(flags)) {} - - ~CUDAEvent() { detail::CUDAEvent_uncheckedFree(internals_); } - - CUDAEvent(const CUDAEvent& other) { - detail::CUDAEvent_retain(other.internals_); - internals_ = other.internals_; + CUDAEvent(unsigned int flags = DEFAULT_FLAGS) + : flags_{flags} { } + + // Note: event destruction done on creating device to avoid creating a + // CUDA context on other devices. + ~CUDAEvent() { + try { + if (is_created_) { + at::DeviceGuard device_guard{(int)device_}; + cudaEventDestroy(event_); + } + } catch (...) 
{ /* No throw */ } } - CUDAEvent(CUDAEvent&& other) { - std::swap(internals_, other.internals_); - } + CUDAEvent(const CUDAEvent&) = delete; + CUDAEvent& operator=(const CUDAEvent&) = delete; - CUDAEvent& operator=(CUDAEvent other) noexcept { - std::swap(internals_, other.internals_); + CUDAEvent(CUDAEvent&& other) { moveHelper(std::move(other)); } + CUDAEvent& operator=(CUDAEvent&& other) { + moveHelper(std::move(other)); return *this; } - operator cudaEvent_t() const { return detail::CUDAEvent_event(internals_); } + operator cudaEvent_t() const { return event(); } // Less than operator (to allow use in sets) friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) { - return left.internals_ < right.internals_; + return left.event_ < right.event_; } - int64_t device() const { return detail::CUDAEvent_device(internals_); } - cudaEvent_t event() const { return detail::CUDAEvent_event(internals_); } - CUDAEventInternals* internals() const { return internals_; } + bool isCreated() const { return is_created_; } + int64_t device() const { return device_; } + cudaEvent_t event() const { return event_; } + + bool happened() const { + return (was_recorded_ && cudaEventQuery(event_) == cudaSuccess); + } + + void record() { record(getCurrentCUDAStream()); } + + void recordOnce(const CUDAStream& stream) { + if (!was_recorded_) record(stream); + } + + void record(const CUDAStream& stream) { + if (is_created_) { + AT_ASSERT(device_ == stream.device()); + } else { + create(stream.device()); + } + + AT_CUDA_CHECK(cudaEventRecord(event_, stream)); + was_recorded_ = true; + } - void record() const; // Record on the current stream - void record(const CUDAStream& stream) const; + void block (const CUDAStream& stream) { + if (is_created_) { + AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0)); + } + } + private: - CUDAEventInternals* internals_; + unsigned int flags_ = DEFAULT_FLAGS; + bool is_created_ = false; + bool was_recorded_ = false; + int64_t device_ = -1; + cudaEvent_t event_; + + void moveHelper(CUDAEvent&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(was_recorded_, other.was_recorded_); + std::swap(device_, other.device_); + std::swap(event_, other.event_); + } + + void create(const int64_t device) { + at::DeviceGuard device_guard{(int)device}; + AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_)); + + is_created_ = true; + device_ = device; + } }; } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDAHalf.cu b/aten/src/ATen/cuda/CUDAHalf.cu index a552b44b4adff..bd121250ee484 100644 --- a/aten/src/ATen/cuda/CUDAHalf.cu +++ b/aten/src/ATen/cuda/CUDAHalf.cu @@ -7,36 +7,31 @@ namespace at { #if CUDA_VERSION < 9000 && !defined(__HIP_PLATFORM_HCC__) -template <> AT_CUDA_API -half convert(Half aten_half) { + +half Converter::operator()(Half aten_half) { return half{aten_half.x}; } -template <> AT_CUDA_API -half convert(double value) { +half Converter::operator()(double value) { return half{Half(value).x}; } -template <> AT_CUDA_API -Half convert(half cuda_half) { +Half Converter::operator()(half cuda_half) { return Half(cuda_half.x, Half::from_bits); } #else -template <> AT_CUDA_API -half convert(Half aten_half) { +half Converter::operator()(Half aten_half) { __half_raw x_raw; x_raw.x = aten_half.x; return half(x_raw); } -template <> AT_CUDA_API -Half convert(half cuda_half) { +Half Converter::operator()(half cuda_half) { __half_raw raw(cuda_half); return Half(raw.x, Half::from_bits); } -template <> AT_CUDA_API -half 
convert(double value) { +half Converter::operator()(double value) { __half_raw raw; raw.x = Half(value).x; return half {raw}; diff --git a/aten/src/ATen/cuda/CUDAHalf.cuh b/aten/src/ATen/cuda/CUDAHalf.cuh index 034ce27ceee5b..6558ed518ac1f 100644 --- a/aten/src/ATen/cuda/CUDAHalf.cuh +++ b/aten/src/ATen/cuda/CUDAHalf.cuh @@ -8,9 +8,22 @@ #include namespace at { -template <> AT_CUDA_API half convert(Half aten_half); -template <> AT_CUDA_API Half convert(half cuda_half); -template <> AT_CUDA_API half convert(double value); + +template <> +struct AT_CUDA_API Converter { + half operator()(Half); +}; + +template <> +struct AT_CUDA_API Converter { + Half operator()(half); +}; + +template <> +struct AT_CUDA_API Converter { + half operator()(double); +}; + #if CUDA_VERSION >= 9000 || defined(__HIP_PLATFORM_HCC__) template <> __half HalfFix(Half h); template <> Half HalfFix(__half h); diff --git a/aten/src/ATen/cuda/CUDAStream.cpp b/aten/src/ATen/cuda/CUDAStream.cpp index 2cf29c9202386..7a5ff1972afb7 100644 --- a/aten/src/ATen/cuda/CUDAStream.cpp +++ b/aten/src/ATen/cuda/CUDAStream.cpp @@ -209,7 +209,8 @@ int64_t CUDAStream_device(CUDAStreamInternals* ptr) { } void CUDAStream_synchronize_with(CUDAStreamInternals* ptr, const CUDAEvent& event) { - AT_CUDA_CHECK(cudaStreamWaitEvent(ptr->stream, event, 0)); + if (event.isCreated()) + AT_CUDA_CHECK(cudaStreamWaitEvent(ptr->stream, event, 0)); } } // namespace detail diff --git a/aten/src/ATen/cuda/CUDAStream.h b/aten/src/ATen/cuda/CUDAStream.h index 6802143d8f28c..cd739222037c9 100644 --- a/aten/src/ATen/cuda/CUDAStream.h +++ b/aten/src/ATen/cuda/CUDAStream.h @@ -77,7 +77,7 @@ AT_CUDA_API int64_t CUDAStream_device(CUDAStreamInternals*); // RAII for a CUDA stream // Allows use as a cudaStream_t, copying, moving, and metadata access. 
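Editorial note: the reworked CUDAEvent above is lazily materialized: no cudaEvent_t exists until the first record(), which also fixes the event's device to the recording stream's device. A usage sketch using only the API introduced in this patch; how the two streams are obtained is left out and the function name is illustrative.

    #include "ATen/cuda/CUDAEvent.h"
    #include "ATen/cuda/CUDAStream.h"

    // Record on one stream, make another stream wait, then poll completion.
    void sync_streams(const at::cuda::CUDAStream& producer,
                      const at::cuda::CUDAStream& consumer) {
      at::cuda::CUDAEvent event;   // nothing allocated yet (isCreated() == false)
      // ... enqueue work on `producer` ...
      event.record(producer);      // creates the event on producer's device
      event.block(consumer);       // consumer waits on the device; host is not blocked
      if (event.happened()) {
        // all work recorded before the event has completed
      }
    }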
-struct CUDAStream { +struct AT_CUDA_API CUDAStream { // Constructors CUDAStream() = default; diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index cccf6dc28453d..4e60ee1597cc4 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index ac7b212199e35..e87a7bb88f8eb 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -331,17 +331,17 @@ def __init__(self, reason): ALLOC_NOARGS_WRAP = { 'THTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, false).release()', + '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', 'THBoolTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Byte, false).release()', + '(${Backend}TensorId(), ScalarType::Byte, allocator(), false).release()', 'THIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, false).release()', + '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()', 'THIntegerTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Int, false).release()', + '(${Backend}TensorId(), ScalarType::Int, allocator(), false).release()', 'THDenseTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::${ScalarName}, false).release()', + '(${Backend}TensorId(), ScalarType::${ScalarName}, allocator(), false).release()', 'THDenseIndexTensor*': 'c10::make_intrusive' - '(${Backend}TensorId(), ScalarType::Long, false).release()' + '(${Backend}TensorId(), ScalarType::Long, allocator(), false).release()' } ALLOC_WRAP = { @@ -557,6 +557,17 @@ def is_mutable_formal_argument(argument, option): return argument.get('output') or option['inplace'] and argument['name'] == 'self' +def check_methods_do_not_start_with_underscore(name, is_method): + if name in {'_local_scalar', '_values', '_indices', '_nnz', '_sparseDims', '_denseDims'}: + return + if is_method and name.startswith('_') and not name.startswith('__'): + message = "Function '{}' starts with a single underscore and is ".format(name) + message += "configured to have a method on Tensor. Functions that start with " + message += " a single underscore should only be functions in the at:: " + message += "namespace and not methods on Tensor!" + raise RuntimeError(message) + + def to_return_type(arg, option): # type: (THFormal, FunctionOption) -> ReturnType t = arg['type'] @@ -810,6 +821,8 @@ def process_option(option, output_options): dispatch_tensor = find_dispatch_tensor(formals) is_namespace_function = is_function and dispatch_tensor is not None + check_methods_do_not_start_with_underscore(option['name'], is_method) + broadcast_arg = get_broadcast_argument(option) # "s_" for "same size". 
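Editorial note: the check_methods_do_not_start_with_underscore guard above is what drives the mechanical call-site rewrites later in this patch: a single-underscore native is generated only as an at:: namespace function, never as a Tensor method. A before/after sketch, using _fill_ (one of the ops actually rewritten below); the wrapper name is made up for illustration.

    #include <ATen/ATen.h>

    at::Tensor& fill_example(at::Tensor& self, at::Scalar value) {
      // Old call style, now rejected for method variants:
      //   self._fill_(value);
      // New style: underscore internals are namespace functions that take the
      // tensor explicitly.
      return at::_fill_(self, value);
    }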
option['method_prefix_derived'] = '' if broadcast_arg is None else 's_' @@ -1032,7 +1045,7 @@ def find_formal(formal_name, formals): option['return_type'] == 'Tensor' and option['deprecated'] needs_native_definition = not is_deprecated_factory_method - has_dispatch = dispatch_tensor or dispatch_type + check_methods_do_not_start_with_underscore(option['name'], is_method) option['method_prefix_derived'] = '' option['device_guard_declaration'] = device_guard(option, formals, is_factory_method) @@ -1250,7 +1263,7 @@ def handle_zero_dim(env, option): if broadcasts_arg: return [] zero_dim_actuals = [arg['name'] - if arg['name'] != zero_dim_dispatch else "{}._local_scalar()".format(arg['name']) + if arg['name'] != zero_dim_dispatch else "at::_local_scalar({})".format(arg['name']) for arg in option['formals_list']] return [ZERO_DIM_CHECK.substitute(env, check_name=zero_dim_dispatch, zero_dim_actuals=zero_dim_actuals)] diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 762262c25ef19..3f962961f5581 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -124,10 +124,7 @@ def check_all_files_written(self): NATIVE_FUNCTIONS_H = CodeTemplate.from_file(TEMPLATE_PATH + "/NativeFunctions.h") TYPE_REGISTER = CodeTemplate("""\ -context->type_registry[static_cast(Backend::${backend})] - [static_cast(ScalarType::${scalar_type})] - .reset(new ${type_name}()); -detail::getVariableHooks().registerVariableTypeFor(context, Backend::${backend}, ScalarType::${scalar_type}); +context->registerType(Backend::${backend}, ScalarType::${scalar_type}, new ${type_name}()); """) file_manager = FileManager() diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 843ebb67007ad..08f306869d89f 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -1,10 +1,14 @@ -#include "ATen/ATen.h" -#include "ATen/Dispatch.h" -#include "ATen/NativeFunctions.h" -#include "DistanceOpsKernel.h" +#include +#include +#include + +#include namespace at { namespace native { +DEFINE_DISPATCH(pdist_forward_stub); +DEFINE_DISPATCH(pdist_backward_stub); + Tensor pairwise_distance(const Tensor& x1, const Tensor& x2, double p, double eps, bool keepdim) { return at::norm(x1 - x2 + eps, p, 1, keepdim); } @@ -20,6 +24,8 @@ Tensor pdist(const Tensor& self, const double p) { Tensor _pdist_forward(const Tensor& self, const double p) { AT_CHECK(self.is_contiguous(), "_pdist_forward requires contiguous input"); + auto device = self.type().device_type(); + AT_CHECK(device == kCPU || device == kCUDA, "_pdist_forward only supports CPU and CUDA devices, got: ", device); Tensor result = self.type().tensor(); if (self.size(0) <= 1) { result.resize_({0}); @@ -29,10 +35,8 @@ Tensor _pdist_forward(const Tensor& self, const double p) { result.resize_({c}); if (self.size(1) == 0) { result.fill_(0); - } else if (self.type().backend() == Backend::CPU) { - pdist_kernel_cpu(result, self, p); } else { - AT_ERROR("pdist only supports CPU backend, got: ", at::toString(self.type().backend())); + pdist_forward_stub(device, result, self, p); } } return result; @@ -41,12 +45,10 @@ Tensor _pdist_forward(const Tensor& self, const double p) { Tensor _pdist_backward(const Tensor& grad, const Tensor& self, const double p, const Tensor& pdist) { AT_CHECK(self.is_contiguous(), "_pdist_backward requires self to be contiguous"); AT_CHECK(pdist.is_contiguous(), "_pdist_backward requires pdist to be contiguous"); + auto device = self.type().device_type(); + AT_CHECK(device == kCPU || device == kCUDA, 
"_pdist_backward only supports CPU and CUDA devices, got: ", device); Tensor result = at::empty_like(self); - if (self.type().backend() == Backend::CPU) { - pdist_backward_kernel_cpu(result, grad, self, p, pdist); - } else { - AT_ERROR("pdist_backward only supports CPU backend, got: ", at::toString(self.type().backend())); - } + pdist_backward_stub(device, result, grad, self, p, pdist); return result; } diff --git a/aten/src/ATen/native/Distance.h b/aten/src/ATen/native/Distance.h new file mode 100644 index 0000000000000..87cdc623c3701 --- /dev/null +++ b/aten/src/ATen/native/Distance.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p); +using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); + +DECLARE_DISPATCH(pdist_forward_fn, pdist_forward_stub); +DECLARE_DISPATCH(pdist_backward_fn, pdist_backward_stub); + +}} // namespace at::native diff --git a/aten/src/ATen/native/DistanceOpsKernel.cpp b/aten/src/ATen/native/DistanceOpsKernel.cpp deleted file mode 100644 index 53f3b856e3a69..0000000000000 --- a/aten/src/ATen/native/DistanceOpsKernel.cpp +++ /dev/null @@ -1,170 +0,0 @@ -#include "DistanceOpsKernel.h" - -#include -#include -#include - -#include "ATen/Dispatch.h" -#include "ATen/Parallel.h" - -namespace at { namespace native { namespace { - -template -struct PDist { - - static inline scalar_t sign(scalar_t val) { - return (0 < val) - (val < 0); - } - - // Zero norm - struct zdist_calc { - static inline void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += diff != 0.0; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - }; - - // One norm - struct odist_calc { - static inline void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += diff; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - static inline scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return grad * sign(diff); } - }; - - // Special general pnorm derivative if p is less than two - struct lttdist_calc { - static inline scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1); } - }; - - // Two norm - struct tdist_calc { - static inline void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += diff * diff; } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return std::sqrt(agg); } - static inline scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : grad * diff / dist; } - }; - - // General p norm - struct pdist_calc { - static inline void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += std::pow(diff, p); } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return std::pow(agg, 1.0 / p); } - static inline scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 
0 : diff * std::pow(std::abs(diff), p - 2) * grad / std::pow(dist, p - 1); } - }; - - // Info norm - struct idist_calc { - static inline void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg = std::max(agg, diff); } - static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } - static inline scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return grad * sign(diff) * (std::abs(diff) == dist); } - }; - - template - static void run_parallel(Tensor& result, const Tensor& self, const scalar_t p) { - auto res_ = result.data(); - auto self_ = self.data(); - int64_t n = self.size(0); - int64_t m = self.size(1); - - int64_t combs = n * (n - 1) / 2; - parallel_for(0, combs, 1, [=](int64_t k, int64_t end) { - float n2 = n - .5; - // The -1 accounts for floating point truncation issues - int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); - int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; - for (; k < end; ++k) { - const scalar_t * a = self_ + i * m; - const scalar_t * b = self_ + j * m; - const scalar_t * const stop = a + m; - scalar_t agg = 0.0; - for (; a != stop; ++a, ++b) { - F::inc(agg, std::abs(*a - *b), p); - } - res_[k] = F::finish(agg, p); - - ++j; - if (j == n) { - ++i; - j = i + 1; - } - } - }); - } - - // Assumes self is nonempty and 2D - static void apply(Tensor& result, const Tensor& self, const scalar_t p) { - if (p == 0.0) { - run_parallel(result, self, p); - } else if (p == 1.0) { - run_parallel(result, self, p); - } else if (p == 2.0) { - run_parallel(result, self, p); - } else if (std::isinf(p)) { - run_parallel(result, self, p); - } else { - run_parallel(result, self, p); - } - } - - template - static void run_backward_parallel(Tensor& result, const Tensor & grad, const Tensor & self, const scalar_t p, const Tensor& dist) { - const int64_t n = self.size(0); - const int64_t m = self.size(1); - const int64_t gs = grad.stride(0); - - const scalar_t * const grad_ = grad.data(); - const scalar_t * const dist_ = dist.data(); - const scalar_t * const self_ = self.data(); - scalar_t * const res_ = result.data(); - - // The only way to parallelize and avoid locking requires parallelizing over the columns :( - at::parallel_for(0, m, 1, [=](int64_t l, int64_t end) { - const scalar_t * self_l = self_ + l; - scalar_t * res_l = res_ + l; - for (; l != end; l += 1, self_l += 1, res_l += 1) { - const scalar_t * grad_k = grad_; - const scalar_t * dist_k = dist_; - const scalar_t * self_i = self_l; - scalar_t * res_i = res_l; - for (const scalar_t * const end_i = self_l + m * (n - 1); self_i != end_i; self_i += m, res_i += m) { - const scalar_t * self_j = self_i + m; - scalar_t * res_j = res_i + m; - for (const scalar_t * const end_j = self_l + m * n; self_j != end_j; self_j += m, res_j += m, grad_k += gs, dist_k += 1) { - const scalar_t res = F::backward(*self_i - *self_j, *grad_k, *dist_k, p); - *res_i += res; - *res_j -= res; - } - } - } - }); - } - - static void apply_backward(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { - result.fill_(0); - if (p == 0.0) { - } else if (p == 1.0) { - run_backward_parallel(result, grad, self, p, dist); - } else if (p < 2.0) { - run_backward_parallel(result, grad, self, p, dist); - } else if (p == 2.0) { - run_backward_parallel(result, grad, self, p, dist); - } else if (std::isinf(p)) { - run_backward_parallel(result, grad, self, p, dist); - } else { - run_backward_parallel(result, grad, self, p, dist); - } - } - 
-}; - -} // anonymous namespace - -void pdist_kernel_cpu(Tensor& result, const Tensor& self, double p) { - AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist", [&] { - PDist::apply(result, self, p); - }); -} - -void pdist_backward_kernel_cpu(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { - AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_backward", [&] { - PDist::apply_backward(result, grad, self, p, dist); - }); -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/DistanceOpsKernel.h b/aten/src/ATen/native/DistanceOpsKernel.h deleted file mode 100644 index 1afd58e1be8e0..0000000000000 --- a/aten/src/ATen/native/DistanceOpsKernel.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include -#include - -namespace at { namespace native { - -void pdist_kernel_cpu(Tensor& result, const Tensor& self, double p); - -void pdist_backward_kernel_cpu(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist); - -}} // namespace at::native diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 36845c7153b7b..ccc6d8a0c0409 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -145,7 +145,7 @@ Tensor& bernoulli_(Tensor& self, const Tensor& p_, Generator* gen) { } Tensor& bernoulli_(Tensor& self, double p, Generator* gen) { - self._bernoulli_(p, gen); + at::_bernoulli_(self, p, gen); return self; } diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index efaa4a4b6f507..56a5afbdee57d 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -82,7 +82,7 @@ ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) Tensor dropout(const Tensor& input, double p, bool train) { if (train && is_fused_kernel_acceptable(input, p)) { - return std::get<0>(input._fused_dropout(1 - p)); + return std::get<0>(at::_fused_dropout(input, 1 - p)); } return _dropout(input, p, train); } diff --git a/aten/src/ATen/native/Gesv.cpp b/aten/src/ATen/native/Gesv.cpp index 0e9a594764546..b45e2a4f98860 100644 --- a/aten/src/ATen/native/Gesv.cpp +++ b/aten/src/ATen/native/Gesv.cpp @@ -110,7 +110,7 @@ std::tuple gesv(const Tensor& self, const Tensor& A) { Tensor self_broadcasted = self.expand(self_expand_size); Tensor A_broadcasted = A.expand(A_expand_size); - return self.type()._gesv_helper(self_broadcasted, A_broadcasted); + return at::_gesv_helper(self_broadcasted, A_broadcasted); } std::tuple gesv_out( diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index e4eb336cd5f45..288fa283abe66 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -301,7 +301,7 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten "index_copy_(): Number of indices (", numIndices, ") should be equal to source.size(dim) (", source.size(dim), ")"); } - return self._indexCopy_(dim, index, source); + return at::_indexCopy_(self, dim, index, source); } }} // at::native diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 796c644dc3355..2371d82efc6cf 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -196,7 +196,7 @@ Tensor addmv(const Tensor& self, const Tensor& mat, const Tensor& vec, Scalar be Tensor& addmv_(Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { check_1d(vec, "vec", "addmv"); - return 
self._addmv_(mat, vec, beta, alpha); + return at::_addmv_(self, mat, vec, beta, alpha); } Tensor& addmv_out(Tensor &result, const Tensor& self, const Tensor& mat, const Tensor& vec, Scalar beta, Scalar alpha) { @@ -213,7 +213,7 @@ Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar b Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { check_1d(vec1, "vec1", "addr"); check_1d(vec2, "vec2", "addr"); - return self._addr_(vec1, vec2, beta, alpha); + return at::_addr_(self, vec1, vec2, beta, alpha); } Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { @@ -225,7 +225,7 @@ Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const T Tensor dot(const Tensor& self, const Tensor& tensor) { check_1d(self, "self", "dot"); check_1d(tensor, "tensor", "dot"); - return self._dot(tensor); + return at::_dot(self, tensor); } Tensor& dot_out(Tensor& result, const Tensor& self, const Tensor& tensor) { @@ -347,5 +347,44 @@ Tensor& matmul_out(Tensor &result, const Tensor & tensor1, const Tensor & tensor return result; } +Tensor matrix_power(const Tensor& a, int64_t n) { + AT_CHECK(a.dim() >= 2 && at::isFloatingType(a.type().scalarType()), + "matrix_power(", a.type(), "{", a.sizes(), "}): expected a tensor " + "of floating types with dim at least 2"); + if (n == 0) { + return a.clone().copy_(at::eye(a.size(-2), a.options()).expand_as(a)); + } else if (n < 0) { + AT_CHECK(a.dim() == 2, "Negative powers for batch matrices are currently not supported"); + Tensor a_ = at::inverse(a); + n *= -1; + return at::native::matrix_power(a_, n); + } else if (n == 1) { + return a.clone(); + } else if (n == 2) { + return at::native::matmul(a, a); + } else if (n == 3) { + return at::native::matmul(at::native::matmul(a, a), a); + } + + // This is a binary decomposition of n. + // Moving from the least significant bit to the most significant bit + // This is done to reduce the number of matrix multiplications + // by raising the input matrix in powers of 2 + // The total number of matrix multiplications are + // number of bits + number of bits that equal 1 ~ O(log n) + // instead of O(n) + Tensor result, z; + int64_t r; + while (n > 0) { + z = (!z.defined()) ? a.clone() : at::native::matmul(z, z); + r = n % 2; + n = n / 2; + if (r == 1) { + result = (!result.defined()) ? z.clone() : at::native::matmul(result, z); + } + } + return result; } -} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index d4ad799948b0c..9b06a513a14cd 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -133,12 +133,11 @@ list. For example, given the declaration `where(BoolTensor cond, Tensor self, T this generates the function `at::where(cond, self, other)` and the method `self.where(cond, other)`. -By default, ATen generates both function and method variants for a native function. -Generally, the function variant is always useful; however, you may not wish -to generate a method variant. Tensor operations as methods are appropriate for "core" -Tensor operations (e.g., add, sub, etc.), but not for more complicated neural network -layers (e.g., `conv2d`) and internal functions designed specifically for binding -(e.g., `cudnn_convolution`). +By default, ATen generates only the function variant for a native function. +When should you also generate a method variant? 
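Editorial note: the binary decomposition in matrix_power above reduces the multiplication count from O(n) to O(log n): the exponent is consumed bit by bit, z holds successive squarings of the input, and z is folded into the result whenever the current bit is set. A scalar sketch of the same loop, under the assumption that a plain double stands in for the matrix and * for matmul.

    #include <cstdint>

    // For n = 13 (binary 1101) the result accumulates a^1 * a^4 * a^8 = a^13,
    // so only a handful of multiplications are needed instead of n - 1.
    double power_by_squaring(double a, int64_t n) {
      double result = 1.0;   // the ATen version starts from an undefined Tensor instead
      double z = a;
      while (n > 0) {
        if (n % 2 == 1) {
          result *= z;       // result = matmul(result, z)
        }
        n /= 2;
        z *= z;              // z = matmul(z, z)
      }
      return result;
    }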
Tensor operations as methods +are appropriate for "core" Tensor operations (e.g., add, sub, etc.), but not for +more complicated neural network layers (e.g., `conv2d`) and internal functions +designed specifically for binding (e.g., `cudnn_convolution`). ### `dispatch` diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 1f7331baa5e9d..2c7e641dcbe84 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -133,7 +133,7 @@ Tensor _sum_cpu(const Tensor& self) { sum_kernel(kCPU, result, self, at::nullopt); return result; } - return self._sumall(); + return at::_sumall(self); } static inline Tensor prod(const Tensor &self, optional dtype) { @@ -154,7 +154,7 @@ Tensor _prod_cpu(const Tensor &self) { prod_kernel(kCPU, result, self, at::nullopt); return result; } - return self._prodall(); + return at::_prodall(self); } // \ALL REDUCE ################################################################ diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 5d1c883bee3f1..7283ed28b9d5e 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -24,29 +24,23 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, const bool inverse, IntList signal_sizes, const bool normalized, const bool onesided) { - if (signal_ndim < 1 || signal_ndim > 3) { - std::ostringstream ss; - ss << "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=" - << signal_ndim; - throw std::runtime_error(ss.str()); - } - if (!at::isFloatingType(self.type().scalarType())) { - std::ostringstream ss; - ss << "Expected an input tensor of floating types, but got input=" - << self.type() << self.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(signal_ndim >= 1 && signal_ndim <= 3, + "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=", + signal_ndim); + AT_CHECK(at::isFloatingType(self.type().scalarType()), + "Expected an input tensor of floating types, but got input=", + self.type(), self.sizes()); auto signal_tensor_ndim = signal_ndim + static_cast(complex_input); // add complex dim if (self.dim() < signal_tensor_ndim) { std::ostringstream ss; ss << "Given signal_ndim=" << signal_ndim << ", expected an input tensor " - << "of at least" << signal_tensor_ndim << "D"; + << "of at least " << signal_tensor_ndim << "D"; if (complex_input) { ss << " (complex input adds an extra dimension)"; } ss << ", but got input=" << self.type() << self.sizes(); - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } auto self_shape = self.sizes(); @@ -68,22 +62,16 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, // now we assume that input is batched as [ B x signal_dims... 
] if (complex_input) { - if (input.size(signal_ndim + 1) != 2) { - std::ostringstream ss; - ss << "Expected an input tensor with a last dimension of size 2 " - << "representing real + imaginary components, but got input " - << self.type() << self.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(input.size(signal_ndim + 1) == 2, + "Expected an input tensor with a last dimension of size 2 " + "representing real + imaginary components, but got input ", + self.type(), self.sizes()); } // build signal_sizes and output_size - if (signal_sizes.size() > 0 && static_cast(signal_sizes.size()) != signal_ndim) { - std::ostringstream ss; - ss << "Expected signal_sizes to be empty (default) or of signal_ndim=" - << signal_ndim << "D, but got signal_sizes=" << signal_sizes; - throw std::runtime_error(ss.str()); - } + AT_CHECK(signal_sizes.size() == 0 || static_cast(signal_sizes.size()) == signal_ndim, + "Expected signal_sizes to be empty (default) or of signal_ndim=", + signal_ndim, "D, but got signal_sizes=", signal_sizes); std::vector output_sizes(signal_ndim + 1 + static_cast(complex_output)); output_sizes[0] = input.size(0); // batch size std::vector checked_signal_sizes(signal_ndim); @@ -110,14 +98,11 @@ static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, output_sizes[i + 1] = input_size; } checked_signal_sizes[i] = input_size; - if (signal_sizes.size() > 0 && signal_sizes[i] != checked_signal_sizes[i]) { - std::ostringstream ss; - ss << "Expected given signal_sizes=" << signal_sizes << " to have same " - << "shape with input at signal dimension " << i << ", but got " - << "signal_sizes=" << signal_sizes << " and input=" << self.type() - << self.sizes(); - throw std::runtime_error(ss.str()); - } + AT_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i], + "Expected given signal_sizes=", signal_sizes," to have same " + "shape with input at signal dimension ", i, ", but got " + "signal_sizes=", signal_sizes, " and input=", self.type(), + self.sizes()); } } if (complex_output) { @@ -222,7 +207,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const int64_t hop_length, if (hop_length <= 0) { std::ostringstream ss; REPR(ss) << ": expected hop_length > 0, but got hop_length=" << hop_length; - throw std::runtime_error(ss.str()); + AT_ERROR(ss.str()); } if (win_length <= 0 || win_length > n_fft) { std::ostringstream ss; diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index c774652478509..40c4ce39addeb 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -65,7 +65,7 @@ bool is_nonzero(const Tensor& self) { if (n > 1) { AT_ERROR("bool value of Tensor with more than one value is ambiguous"); } - Scalar localScalar = self._local_scalar(); + Scalar localScalar = at::_local_scalar(self); if (localScalar.isFloatingPoint()) { return localScalar.to() != 0; } else if (localScalar.isIntegral()){ diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 748428db2b6ba..1a12549b5e70e 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -23,7 +23,7 @@ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // A number of factory functions are implemented in the following way: // -// return at::getMaybeVariableType(options)._arange(start, end, step); +// return at::getType(options)._arange(start, end, step); // // That is to say, they grab a Type for TensorOptions, and 
then call some // internal method. What's going on? @@ -33,7 +33,7 @@ // (and never will) understand TensorOptions, so we need to handle TensorOptions // inside native before batting over to TH. The expectation is that when // these factories get ported to native, this is no longer necessary, -// and we can eliminate the getMaybeVariableType call. +// and we can eliminate the getType call. namespace at { namespace native { @@ -73,7 +73,7 @@ Tensor arange( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getMaybeVariableType(options)._arange(start, end, step); + return at::getType(options)._arange(start, end, step); } Tensor& arange_out(Tensor& result, Scalar start, Scalar end) { @@ -86,7 +86,7 @@ Tensor& arange_out(Tensor& result, Scalar start, Scalar end, Scalar step) { Tensor arange(Scalar end, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getMaybeVariableType(options)._arange(end); + return at::getType(options)._arange(end); } Tensor& arange_out(Tensor& result, Scalar end) { @@ -102,7 +102,7 @@ Tensor _dim_arange(const Tensor& like, int64_t dim) { Tensor empty(IntList size, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] // Can't call a factory function, because the buck stops with us! - return at::getMaybeVariableType(options).tensor(size); + return at::getType(options).tensor(size); } Tensor& empty_out(Tensor& result, IntList size) { @@ -139,7 +139,6 @@ Tensor empty_like(const Tensor& self, const TensorOptions& options) { if (options.layout() == kSparse && self.type().is_sparse()) { auto res = native::empty({0}, options); // to be resized res.sparse_resize_and_clear_(self.sizes(), self._sparseDims(), self._denseDims()); - return res; } return native::empty(self.sizes(), options); @@ -219,7 +218,7 @@ Tensor linspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getMaybeVariableType(options)._linspace(start, end, steps); + return at::getType(options)._linspace(start, end, steps); } Tensor& linspace_out(Tensor& result, Scalar start, Scalar end) { @@ -242,7 +241,7 @@ Tensor logspace( int64_t steps, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getMaybeVariableType(options)._logspace(start, end, steps); + return at::getType(options)._logspace(start, end, steps); } Tensor& logspace_out(Tensor& result, Scalar start, Scalar end) { @@ -476,7 +475,7 @@ Tensor range( Scalar step, const TensorOptions& options) { // Note [Native bindings for legacy TH factory functions] - return at::getMaybeVariableType(options)._range(start, end, step); + return at::getType(options)._range(start, end, step); } Tensor& range_out(Tensor& result, Scalar start, Scalar end) { diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index a4b647d62a621..d326302c5b281 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -41,11 +41,14 @@ bool cudnn_is_acceptable(const Tensor& self) { } Tensor detach(const Tensor& self) { - return self.unsafeGetTensorImpl()->detach(); + // this just exists to give us a hook in VariableType and an entry in Declarations.yaml + AT_ERROR("detach is not implemented for Tensor"); + return self; } Tensor & detach_(Tensor & self) { - self.unsafeGetTensorImpl()->detach_(); + // this just exists to give us a hook 
in VariableType and an entry in Declarations.yaml + AT_ERROR("detach_ is not implemented for Tensor"); return self; } diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp index a3c5f6882bf45..af81c02abc801 100644 --- a/aten/src/ATen/native/TypeProperties.cpp +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -13,6 +13,10 @@ bool is_distributed(const Tensor& self) { return self.type().is_distributed(); } +bool is_complex(const Tensor& self) { + return at::isComplexType(self.type().scalarType()); +} + bool is_floating_point(const Tensor& self) { return at::isFloatingType(self.type().scalarType()); } diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 2d8a3e2da4c7c..89a13e14b8b2e 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -89,11 +89,11 @@ Tensor& _clamp_min_out_cpu(Tensor& result, const Tensor& self, Scalar min) { } Tensor& fill_(Tensor& self, Scalar value) { - return self._fill_(value); + return at::_fill_(self, value); } Tensor& fill_(Tensor& self, const Tensor& value) { - return self._fill_(value); + return at::_fill_(self, value); } Tensor mvlgamma(const Tensor& self, int64_t p) { diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp new file mode 100644 index 0000000000000..efdb957d2fffb --- /dev/null +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -0,0 +1,240 @@ +#include + +#include +#include +#include + +#include +#include +#include + +namespace at { namespace native { namespace { + +template +struct PDist { + using Vec = vec256::Vec256; + + // Depending on the value of the pnorm, there are specific implementations + // that are much faster than std::pow(std::abs(a - b), p), but have the same + // standard loop code for how to process the input vector. To reuse the main + // outside loop while still guaranteeing that the compiler inlines every + // different function on p, we break the inner norm logic into structs with + // static functions that represent what's done differently, and template the + // outer loop on those structs. + // + // The four functions are: + // map : This tells how to modify (a - b) to form the component that + // gets summed. + // red : This tells how to sum the result of map up. This is + // separate because the inf norm actuall uses max instead of + // sum. + // finish : This tells what to do with the aggregated value to compute + // the norm. Generally this is the result of val ^ (1 / p). + // backward : This is the gradient for that norm. Arguments are pretty + // self explanitory. + // + // There are a few cases where these aren't used. The 0 norm has no backward, + // because it's always 0, so that's shortcircuited earlier. There's a special + // implementation of the general backward pass when p is less than two, so + // there's a struct with only a backward pass for this case. + + // TODO This is an inefficient way to compite sign, and can be much faster + // using native SSE instructions that should be added to Vec256. 
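Editorial note: the new CPU kernel above funnels every p-norm through one templated loop and small policy structs (map, red, finish, backward), so each norm's arithmetic is inlined rather than branched on per element. A scalar sketch of that structure, with red simplified to a plain sum; the vectorized Vec256 machinery is omitted.

    #include <cmath>
    #include <cstdint>

    // Each policy describes how one norm accumulates and finishes; the driver
    // loop is written once and instantiated per policy, like run_parallel<F>.
    struct OneNorm {
      static double map(double diff, double /*p*/) { return diff; }
      static double finish(double agg, double /*p*/) { return agg; }
    };
    struct PNorm {
      static double map(double diff, double p) { return std::pow(diff, p); }
      static double finish(double agg, double p) { return std::pow(agg, 1.0 / p); }
    };

    template <typename F>
    double distance(const double* a, const double* b, int64_t m, double p) {
      double agg = 0.0;
      for (int64_t l = 0; l < m; ++l) {
        agg += F::map(std::abs(a[l] - b[l]), p);
      }
      return F::finish(agg, p);
    }

    // distance<OneNorm>(a, b, m, 1.0) and distance<PNorm>(a, b, m, 3.0) share
    // the loop but compile down to specialized inner bodies.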
+ static inline Vec sign(Vec val) { + return vec256::min(vec256::max(Vec(0), val.ceil()), Vec(1)) + + vec256::min(vec256::max(Vec(-1), val.floor()), Vec(0)); + } + + // Zero norm + struct zdist_calc { + static inline Vec map(const Vec& diff, const Vec& p) { return vec256::min(diff.abs().ceil(), Vec(1)); } + static inline Vec red(const Vec& agg, const Vec& up) { return agg + up; } + static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + }; + + // One norm + struct odist_calc { + static inline Vec map(const Vec& diff, const Vec& p) { return diff; } + static inline Vec red(const Vec& agg, const Vec& up) { return agg + up; } + static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff); } + }; + + // Special general pnorm derivative if p is less than two + struct lttdist_calc { + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1)); } + }; + + // Two norm + struct tdist_calc { + // TODO This can probably use fused add multiply to get better perf + static inline Vec map(const Vec& diff, const Vec& p) { return diff * diff; } + static inline Vec red(const Vec& agg, const Vec& up) { return agg + up; } + static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return std::sqrt(agg); } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : Vec(grad) * diff / Vec(dist); } + }; + + // General p norm + struct pdist_calc { + static inline Vec map(const Vec& diff, const Vec& p) { return diff.pow(p); } + static inline Vec red(const Vec& agg, const Vec& up) { return agg + up; } + static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return std::pow(agg, 1.0 / p); } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : diff * diff.abs().pow(p - Vec(2)) * Vec(grad) / Vec(dist).pow(p - Vec(1)); } + }; + + // Info norm + struct idist_calc { + static inline Vec map(const Vec& diff, const Vec& p) { return diff; } + static inline Vec red(const Vec& agg, const Vec& up) { return vec256::max(agg, up); } + static inline scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + // TODO This backward pass uses a very complext expression to compute (diff + // == dist) that could be much faster if using SSE instructions. + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return Vec(grad) * sign(diff) * (Vec(1) - vec256::min(Vec(1), (diff.abs() - Vec(dist)).abs().ceil())); } + }; + + template + static void run_parallel(Tensor& result, const Tensor& self, const scalar_t p) { + const scalar_t * const self_start = self.data(); + const scalar_t * const self_end = self_start + self.numel(); + int64_t n = self.size(0); + int64_t m = self.size(1); + + scalar_t * const res_start = result.data(); + int64_t combs = result.numel(); // n * (n - 1) / 2 + const Vec pvec(p); + + // We conceptually iterate over tuples of (i, j, k) where i is the first + // vector from the input, j is the second, and k is the result index. This + // parallelizes over the range of k and infers what i and j are from the + // value of k. 
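Editorial note: the closed-form index math described in the comment above lets each parallel chunk start at an arbitrary flat index k without walking the triangle: with n2 = n - 0.5, the row is i = floor(n2 - sqrt(n2 * n2 - 2k - 1)) and the column follows from the triangular offset. A small standalone check of that mapping, separate from the kernel itself.

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Recover the pair (i, j) with i < j < n addressed by flat index k in the
    // pdist output, using the same formula as the kernels in this patch.
    void k_to_pair(int64_t k, int64_t n, int64_t& i, int64_t& j) {
      const double n2 = n - 0.5;
      // The -1 accounts for floating point truncation issues (as in the kernel).
      i = static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2 * k - 1));
      j = k - n * i + i * (i + 1) / 2 + i + 1;
    }

    int main() {
      const int64_t n = 7;
      int64_t k = 0;
      for (int64_t i = 0; i < n; ++i) {
        for (int64_t j = i + 1; j < n; ++j, ++k) {
          int64_t ri, rj;
          k_to_pair(k, n, ri, rj);
          assert(ri == i && rj == j);  // matches row-major pair ordering
        }
      }
      return 0;
    }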
+ parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [=, &pvec](int64_t k, int64_t end) { + float n2 = n - .5; + // The -1 accounts for floating point truncation issues + int64_t i = static_cast((n2 - std::sqrt(n2 * n2 - 2 * k - 1))); + int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; + + const scalar_t * self_i = self_start + i * m; + const scalar_t * self_j = self_start + j * m; + scalar_t * res = res_start + k; + const scalar_t * const res_end = res_start + end; + + while (res != res_end) { + *res = F::finish(vec256::map2_reduce_all( + [&pvec](Vec a, Vec b) { return F::map((a - b).abs(), pvec); }, + F::red, self_i, self_j, m), p); + + res += 1; + self_j += m; + if (self_j == self_end) { + self_i += m; + self_j = self_i + m; + } + } + }); + } + + // Assumes self is nonempty, contiguous, and 2D + static void apply(Tensor& result, const Tensor& self, const scalar_t p) { + if (p == 0.0) { + run_parallel(result, self, p); + } else if (p == 1.0) { + run_parallel(result, self, p); + } else if (p == 2.0) { + run_parallel(result, self, p); + } else if (std::isinf(p)) { + run_parallel(result, self, p); + } else { + run_parallel(result, self, p); + } + } + + template + inline static void backward_down_column(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size) { + for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) { + + const Vec self_vec_i = Vec::loadu(self_i, count); + Vec res_vec_i = Vec::loadu(res_i, count); + + const scalar_t * self_j = self_i + m; + scalar_t * res_j = res_i + m; + for (; self_j != self_end; self_j += m, res_j += m, grad_k += gs, dist_k += 1) { + const Vec self_vec_j = Vec::loadu(self_j, count); + Vec res_vec_j = Vec::loadu(res_j, count); + + Vec res = F::backward(self_vec_i - self_vec_j, *grad_k, *dist_k, pvec); + res_vec_i = res_vec_i + res; + res_vec_j = res_vec_j - res; + + res_vec_j.store(res_j, count); + } + + res_vec_i.store(res_i, count); + } + } + + template + static void run_backward_parallel(Tensor& result, const Tensor & grad, const Tensor & self, const scalar_t p, const Tensor& dist) { + const int64_t n = self.size(0); + const int64_t m = self.size(1); + const int64_t gs = grad.stride(0); + const Vec pvec(p); + + const scalar_t * const grad_start = grad.data(); + const scalar_t * const dist_start = dist.data(); + const scalar_t * const self_start = self.data(); + scalar_t * const res_start = result.data(); + + // The only way to parallelize and avoid locking requires parallelizing + // over the columns of the input, i.e. we compute the gradient for the + // first section of each vector independentaly of the second section, etc. 
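Editorial note: parallelizing the backward pass over pairs or rows would race, since every pair (i, j) writes to both row i and row j of the gradient. Splitting over columns, as the comment above explains, gives each task exclusive ownership of the entries it touches. A scalar sketch of that ownership pattern; coeff stands in for the per-pair factor that F::backward derives from grad and dist (e.g. grad / dist for the two-norm).

    #include <cstdint>

    // One task owns column l of the n x m gradient buffer `res`; for every pair
    // (i, j) it updates only res[i*m + l] and res[j*m + l], so no two tasks ever
    // write the same element and no locking is needed.
    void backward_column(double* res, const double* self, const double* coeff,
                         int64_t n, int64_t m, int64_t l) {
      int64_t k = 0;  // flat pair index, same ordering as the forward pass
      for (int64_t i = 0; i < n; ++i) {
        for (int64_t j = i + 1; j < n; ++j, ++k) {
          const double g = coeff[k] * (self[i * m + l] - self[j * m + l]);
          res[i * m + l] += g;
          res[j * m + l] -= g;
        }
      }
    }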
+ at::parallel_for(0, m / Vec::size, internal::GRAIN_SIZE / (8 * n * n), [=, &pvec](int64_t l, int64_t end) { + const scalar_t * self_l = self_start + l * Vec::size; + scalar_t * res_l = res_start + l * Vec::size; + + for (const scalar_t * const res_end = res_start + end * Vec::size; res_l != res_end; self_l += Vec::size, res_l += Vec::size) { + backward_down_column(self_l, res_l, grad_start, dist_start, pvec, n, m, gs); + } + }); + const int64_t remainder = m % Vec::size; + if (remainder) { + backward_down_column(self_start + (m - remainder), res_start + (m - remainder), grad_start, dist_start, pvec, n, m, gs, remainder); + } + } + + // Assumes self is nonempty, contiguous, and 2D and dist is also contiguous + static void apply_backward(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { + result.fill_(0); + if (p == 0.0) { + } else if (p == 1.0) { + run_backward_parallel(result, grad, self, p, dist); + } else if (p < 2.0) { + run_backward_parallel(result, grad, self, p, dist); + } else if (p == 2.0) { + run_backward_parallel(result, grad, self, p, dist); + } else if (std::isinf(p)) { + run_backward_parallel(result, grad, self, p, dist); + } else { + run_backward_parallel(result, grad, self, p, dist); + } + } + +}; + +void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double p) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist", [&] { + PDist::apply(result, self, p); + }); +} + +static void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { + AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_backward", [&] { + PDist::apply_backward(result, grad, self, p, dist); + }); +} + +} // anonymous namespace + +REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl); +REGISTER_DISPATCH(pdist_backward_stub, &pdist_backward_kernel_impl); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/CUDAReduceOps.cpp b/aten/src/ATen/native/cuda/CUDAReduceOps.cpp index a3b32cea21ae0..902b1e16fd6e4 100644 --- a/aten/src/ATen/native/cuda/CUDAReduceOps.cpp +++ b/aten/src/ATen/native/cuda/CUDAReduceOps.cpp @@ -3,9 +3,9 @@ namespace at { namespace native { -Tensor _sum_cuda(const Tensor &self_) { return self_._sumall(); } +Tensor _sum_cuda(const Tensor &self_) { return at::_sumall(self_); } -Tensor _prod_cuda(const Tensor &self_) { return self_._prodall(); } +Tensor _prod_cuda(const Tensor &self_) { return at::_prodall(self_); } Tensor &_sum_out_cuda(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu new file mode 100644 index 0000000000000..02c143254ced7 --- /dev/null +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -0,0 +1,218 @@ +#include "ATen/ATen.h" +#include +#include + +#include "ATen/native/Distance.h" + + +namespace at { namespace native { + +namespace { + +static const int forward_threads = 256; + +template +static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val); + +template <> +__forceinline__ __device__ float device_sqrt(float val) { + return ::sqrtf(val); +} + +template <> +__forceinline__ __device__ double device_sqrt(double val) { + return ::sqrt(val); +} + +template +struct dists { + + static __forceinline__ __device__ scalar_t sign(scalar_t val) { + return (0 < val) - (val < 0); + } + + // Zero norm + struct zero { + static __forceinline__ __device__ void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) 
{ agg += diff != 0.0; } + static __forceinline__ __device__ scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + static __forceinline__ __device__ void agg(scalar_t& update, const scalar_t other) { update += other; } + }; + + // One norm + struct one { + static __forceinline__ __device__ void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += diff; } + static __forceinline__ __device__ scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + static __forceinline__ __device__ void agg(scalar_t& update, const scalar_t other) { update += other; } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return grad * sign(diff); } + }; + + // Special case backward when p is less than two + struct lt_two { + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1); } + }; + + // Two norm + struct two { + static __forceinline__ __device__ void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += diff * diff; } + static __forceinline__ __device__ scalar_t finish(const scalar_t agg, const scalar_t p) { return device_sqrt(agg); } + static __forceinline__ __device__ void agg(scalar_t& update, const scalar_t other) { update += other; } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : grad * diff / dist; } + }; + + // General p norm + struct p { + static __forceinline__ __device__ void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { agg += std::pow(diff, p); } + static __forceinline__ __device__ scalar_t finish(const scalar_t agg, const scalar_t p) { return std::pow(agg, static_cast(1) / p); } + static __forceinline__ __device__ void agg(scalar_t& update, const scalar_t other) { update += other; } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 
0 : diff * std::pow(std::abs(diff), p - 2) * grad / std::pow(dist, p - 1); } + }; + + // Inf norm + struct inf { + static __forceinline__ __device__ void inc(scalar_t& agg, const scalar_t diff, const scalar_t p) { if (diff > agg) { agg = diff; } } + static __forceinline__ __device__ scalar_t finish(const scalar_t agg, const scalar_t p) { return agg; } + static __forceinline__ __device__ void agg(scalar_t& update, const scalar_t other) { if (other > update) { update = other; } } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return grad * sign(diff) * (std::abs(diff) == dist); } + }; + +}; + +template +__global__ static void pdist_kernel_cuda_impl(scalar_t * result, const scalar_t * self, const int64_t n, const int64_t m, const scalar_t p) { + const int k = blockIdx.x; + const int stride = blockDim.x; + + float n2 = n - .5; + // The -1 accounts for floating point truncation issues + int64_t i = static_cast((n2 - device_sqrt(n2 * n2 - 2 * k - 1))); + int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; + + const scalar_t * const start = self + i * m; + const scalar_t * const end = start + m; + const scalar_t * a = start + threadIdx.x; + const scalar_t * b = self + j * m + threadIdx.x; + scalar_t agg = 0.0; + for (; a < end; a += stride, b += stride) { + F::inc(agg, std::abs(*a - *b), p); + } + + // Reduce warps + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + F::agg(agg, WARP_SHFL_DOWN(agg, offset)); + } + + // Reduce block + // This shared memory is significantly larger than necessary, but the + // assumption is that it's not a bottleneck, and this is simple + __shared__ scalar_t shared[forward_threads]; + int lane = threadIdx.x % warpSize; + int warp_id = threadIdx.x / warpSize; + if (lane == 0) { + shared[warp_id] = agg; + } + __syncthreads(); + agg = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0.0; + if (warp_id == 0) { + // Only reduce theads with nonzero data + for (int offset = blockDim.x / warpSize / 2; offset > 0; offset /= 2) { + F::agg(agg, WARP_SHFL_DOWN(agg, offset)); + } + } + if (threadIdx.x == 0) { + result[k] = F::finish(agg, p); + } +} + +template +__global__ static void pdist_backward_kernel_cuda_impl(scalar_t * buffer, const scalar_t * grad, const scalar_t * self, const scalar_t * dist, int64_t gs, const int64_t n, const int64_t m, const int64_t combs, const scalar_t p) { + const int k = blockIdx.y * blockDim.y + threadIdx.y; + const int init = blockIdx.x * blockDim.x + threadIdx.x; + const int stride = blockDim.x * gridDim.x; + + if (k >= combs) { + return; + } + + float n2 = n - .5; + // The -1 accounts for floating point truncation issues + int64_t i = static_cast((n2 - device_sqrt(n2 * n2 - 2 * k - 1))); + int64_t j = k - n * i + i * (i + 1) / 2 + i + 1; + int64_t ib = j - i - 1; + int64_t jb = n - 2 - i; + + const scalar_t grad_k = grad[k * gs]; + const scalar_t dist_k = dist[k]; + + const scalar_t * const start = self + i * m; + const scalar_t * const end = start + m; + const scalar_t * self_i = start + init; + const scalar_t * self_j = self + j * m + init; + scalar_t * buff_i = buffer + (ib * n + i) * m + init; + scalar_t * buff_j = buffer + (jb * n + j) * m + init; + for (; self_i < end; self_i += stride, self_j += stride, buff_i += stride, buff_j += stride) { + const scalar_t res = F::backward(*self_i - *self_j, grad_k, dist_k, p); + *buff_i = res; + *buff_j = -res; + } +} + +void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) { + const dim3 grid(result.numel()); + const dim3 block(forward_threads); + int64_t n = self.size(0); + int64_t m = self.size(1); + + AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda", [&] { + if (p == 0.0) { + pdist_kernel_cuda_impl::zero><<>>(result.data(), self.data(), n, m, p); + } else if (p == 1.0) { + pdist_kernel_cuda_impl::one><<>>(result.data(), self.data(), n, m, p); + } else if (p == 2.0) { + pdist_kernel_cuda_impl::two><<>>(result.data(), self.data(), n, m, p); + } else if (std::isinf(p)) { + pdist_kernel_cuda_impl::inf><<>>(result.data(), self.data(), n, m, p); + } else { + pdist_kernel_cuda_impl::p><<>>(result.data(), self.data(), n, m, p); + } + }); +} + +void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { + if (p == 0.0 || grad.numel() == 0 || self.numel() == 0) { + result.fill_(0); + return; + } + + const int64_t n = result.size(0); + int64_t m = self.size(1); + const int block_x = 64; + const int block_y = 4; + const int grid_x = (m + block_x * 8 - 1) / (block_x * 8); + const int grid_y = (dist.numel() + block_y - 1) / block_y; + const dim3 grid(grid_x, grid_y); + const dim3 block(block_x, block_y); + + Tensor buffer = result.type().tensor({n - 1, result.size(0), result.size(1)}); + AT_DISPATCH_FLOATING_TYPES(self.type(), "pdist_cuda_backward", [&] { + if (p == 1.0) { + pdist_backward_kernel_cuda_impl::one><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); + } else if (p < 2.0) { + pdist_backward_kernel_cuda_impl::lt_two><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); + } else if (p == 2.0) { + pdist_backward_kernel_cuda_impl::two><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); + } else if (std::isinf(p)) { + 
pdist_backward_kernel_cuda_impl::inf><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); + } else { + pdist_backward_kernel_cuda_impl::p><<>>(buffer.data(), grad.data(), self.data(), dist.data(), grad.stride(0), n, m, dist.numel(), p); + } + }); + + at::sum_out(result, buffer, 0); +} + +} // anonymous namespace + +REGISTER_DISPATCH(pdist_forward_stub, &pdist_forward_kernel_impl); +REGISTER_DISPATCH(pdist_backward_stub, &pdist_backward_kernel_impl); + +}} // at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4b5e758492f76..2b71982be8cf6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6,185 +6,169 @@ # specialized operators for each datatype. # TODO: remove when we have Type support in the IR - func: _cast_Byte(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Char(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Double(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Float(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Int(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Long(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Short(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor - variants: function, method + variants: function - func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) - variants: function dispatch: CUDA: _cudnn_ctc_loss - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor - variants: function dispatch: CUDA: _cudnn_rnn_flatten_weight - func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: _cudnn_rnn - func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? 
dropout_state, Tensor reserve, std::array output_mask) -> (Tensor, Tensor, Tensor, TensorList) - variants: function dispatch: CUDA: _cudnn_rnn_backward - func: _cudnn_init_dropout_state(Type self_ty, double dropout, bool train, int64_t dropout_seed) -> Tensor - variants: function dispatch: CUDA: _cudnn_init_dropout_state - func: _fused_dropout(Tensor self, double p, Generator* generator=nullptr) -> (Tensor, Tensor) + variants: function dispatch: CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, double scale) -> Tensor + variants: function dispatch: CUDA: masked_scale_cuda - func: dropout(Tensor input, double p, bool train) -> Tensor - variants: function - func: dropout_(Tensor self, double p, bool train) -> Tensor - variants: function - func: feature_dropout(Tensor input, double p, bool train) -> Tensor - variants: function - func: feature_dropout_(Tensor self, double p, bool train) -> Tensor - variants: function - func: alpha_dropout(Tensor input, double p, bool train) -> Tensor - variants: function - func: alpha_dropout_(Tensor self, double p, bool train) -> Tensor - variants: function - func: feature_alpha_dropout(Tensor input, double p, bool train) -> Tensor - variants: function - func: feature_alpha_dropout_(Tensor self, double p, bool train) -> Tensor - variants: function - func: abs(Tensor self) -> Tensor + variants: function, method - func: abs_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _abs__cpu CUDA: _abs__cuda - func: abs_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _abs_out_cpu CUDA: _abs_out_cuda - func: acos(Tensor self) -> Tensor + variants: function, method - func: acos_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _acos__cpu CUDA: _acos__cuda - func: acos_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _acos_out_cpu CUDA: _acos_out_cuda - func: avg_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, bool ceil_mode=false, bool count_include_pad=true) -> Tensor - variants: function - func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor - variants: function - func: adaptive_max_pool1d(Tensor self, IntList[1] output_size) -> (Tensor, Tensor) - variants: function - func: add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor + variants: function, method - func: add_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor variants: method - func: add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - variants: function # For C++ only, until we have conversion from C++ numbers to Tensor - func: add(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + variants: function, method + - func: add_(Tensor self, Scalar other, Scalar alpha=1) -> Tensor variants: method - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method - func: addmv_(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method - func: addmv_out(Tensor result, Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: function - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method - func: addr_(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: method - func: addr_out(Tensor result, Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, 
Scalar alpha=1) -> Tensor - variants: function - func: all(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method - func: all_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor - variants: function - func: allclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> bool + variants: function, method - func: any(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method - func: any_out(Tensor result, Tensor self, int64_t dim, bool keepdim=false) -> Tensor - variants: function - func: arange(Scalar start, Scalar end, TensorOptions options={}) -> Tensor - variants: function - func: arange(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor - variants: function - func: arange_out(Tensor result, Scalar start, Scalar end) -> Tensor - variants: function - func: arange_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor - variants: function - func: arange(Scalar end, TensorOptions options={}) -> Tensor - variants: function - func: arange_out(Tensor result, Scalar end) -> Tensor - variants: function - func: arange(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor - variants: function deprecated: true - func: arange(Type dtype, Scalar end) -> Tensor - variants: function deprecated: true # This function is a temporary hack to allow tracing of arange like constructs with dynamic @@ -193,7 +177,6 @@ # preserve tracing. Get rid of this when arange can directly take tensors for bounds # (so that it can be traced directly). - func: _dim_arange(Tensor like, int64_t dim) -> Tensor - variants: function # `argmin` and `argmax` are exposed in C++ but not in Python, where we only # expose `_argmin` and `_argmax` (which call the first versions). In Python, we @@ -201,116 +184,134 @@ # which gets the argmax/argmin of the flattened array. 
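Before the declarations themselves, a caller's-eye sketch of what that comment means, assuming an ATen build generated from this yaml; the function name and the tensor `t` are placeholders, not part of the diff:

    #include "ATen/ATen.h"

    // Sketch only: the caller-facing shape of the argmax/argmin entries
    // declared just below.
    void argmax_overloads(const at::Tensor& t) {  // e.g. t = at::randn({3, 4})
      // Reducing overloads: take a dim (and optional keepdim) and are
      // generated as both free functions and Tensor methods
      // ("variants: function, method").
      at::Tensor per_row = at::argmax(t, /*dim=*/1, /*keepdim=*/false);
      at::Tensor per_col = t.argmin(/*dim=*/0);
      // Dim-less overloads: index into the flattened tensor; per the comment
      // above these are exposed in C++ but not directly in Python.
      at::Tensor flat = t.argmax();
      // _argmax/_argmin stay function-only; they are what the Python-side
      // wrappers call, not something ordinary C++ code needs.
      (void)per_row; (void)per_col; (void)flat;
    }
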
- func: argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method + - func: argmax(Tensor self) -> Tensor + variants: function, method + - func: _argmax(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function - func: argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method + - func: argmin(Tensor self) -> Tensor + variants: function, method + - func: _argmin(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function - func: as_strided(Tensor self, IntList size, IntList stride) -> Tensor + variants: function, method - func: as_strided_(Tensor self, IntList size, IntList stride) -> Tensor + variants: function, method - func: as_strided(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor + variants: function, method python_default_init: storage_offset: self.storage_offset() - func: as_strided_(Tensor self, IntList size, IntList stride, int64_t storage_offset) -> Tensor + variants: function, method python_default_init: storage_offset: self.storage_offset() - func: asin(Tensor self) -> Tensor + variants: function, method - func: asin_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _asin__cpu CUDA: _asin__cuda - func: asin_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _asin_out_cpu CUDA: _asin_out_cuda - func: atan(Tensor self) -> Tensor + variants: function, method - func: atan_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _atan__cpu CUDA: _atan__cuda - func: atan_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _atan_out_cpu CUDA: _atan_out_cuda - func: bartlett_window(int64_t window_length, TensorOptions options={}) -> Tensor - variants: function - func: bartlett_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor - variants: function - func: batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double momentum, double eps, bool cudnn_enabled) -> Tensor - variants: function - func: bernoulli(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor + variants: function, method - func: bernoulli(Tensor self, double p, Generator* generator=nullptr) -> Tensor + variants: function, method - func: bernoulli(Tensor self) -> Tensor + variants: function, method - func: bernoulli_(Tensor self, Tensor p, Generator* generator=nullptr) -> Tensor + variants: function, method - func: bernoulli_(Tensor self, double p, Generator* generator=nullptr) -> Tensor + variants: function, method - func: bernoulli_(Tensor self) -> Tensor + variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor - variants: function - func: bincount(Tensor self, Tensor? 
weights={}, int64_t minlength=0) -> Tensor + variants: function, method dispatch: CPU: _bincount_cpu CUDA: _bincount_cuda - func: blackman_window(int64_t window_length, TensorOptions options={}) -> Tensor - variants: function - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor - variants: function - func: broadcast_tensors(TensorList tensors) -> TensorList - variants: function - func: cat(TensorList tensors, int64_t dim=0) -> Tensor - variants: function - func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor - variants: function - func: ceil(Tensor self) -> Tensor + variants: function, method - func: ceil_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _ceil__cpu CUDA: _ceil__cuda - func: ceil_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _ceil_out_cpu CUDA: _ceil_out_cuda - func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList + variants: function, method - func: clamp(Tensor self, Scalar min, Scalar max) -> Tensor + variants: function, method python_default_init: min: NAN max: NAN - func: clamp_(Tensor self, Scalar min, Scalar max) -> Tensor + variants: function, method dispatch: CPU: _clamp__cpu CUDA: _clamp__cuda @@ -319,7 +320,6 @@ max: NAN - func: clamp_out(Tensor result, Tensor self, Scalar min, Scalar max) -> Tensor - variants: function dispatch: CPU: _clamp_out_cpu CUDA: _clamp_out_cuda @@ -328,108 +328,97 @@ max: NAN - func: clamp_max(Tensor self, Scalar max) -> Tensor + variants: function, method - func: clamp_max_(Tensor self, Scalar max) -> Tensor + variants: function, method dispatch: CPU: _clamp_max__cpu CUDA: _clamp_max__cuda - func: clamp_max_out(Tensor result, Tensor self, Scalar max) -> Tensor - variants: function dispatch: CPU: _clamp_max_out_cpu CUDA: _clamp_max_out_cuda - func: clamp_min(Tensor self, Scalar min) -> Tensor + variants: function, method - func: clamp_min_(Tensor self, Scalar min) -> Tensor + variants: function, method dispatch: CPU: _clamp_min__cpu CUDA: _clamp_min__cuda - func: clamp_min_out(Tensor result, Tensor self, Scalar min) -> Tensor - variants: function dispatch: CPU: _clamp_min_out_cpu CUDA: _clamp_min_out_cuda - func: cudnn_is_acceptable(Tensor self) -> bool - variants: function device_guard: false - func: convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups) -> Tensor - variants: function - func: _convolution(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor - variants: function - func: _convolution_nogroup(Tensor input, Tensor weight, Tensor? bias, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding) -> Tensor - variants: function # NB: We MUST call the input self, otherwise codegen will attempt to # dispatch on ggI... which might be undefined. - func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, IntList stride, IntList padding, IntList dilation, bool transposed, IntList output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, std::array output_mask) -> (Tensor, Tensor, Tensor) - variants: function - func: conv1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] dilation=1, int64_t groups=1) -> Tensor - variants: function - func: conv2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] dilation=1, int64_t groups=1) -> Tensor - variants: function - func: conv3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1, int64_t groups=1) -> Tensor - variants: function - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) -> Tensor - variants: function - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int64_t pad) -> (Tensor, Tensor, Tensor) - variants: function # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor bias={}, IntList[1] stride=1, IntList[1] padding=0, IntList[1] output_padding=0, int64_t groups=1, IntList[1] dilation=1) -> Tensor - variants: function - func: conv_transpose2d(Tensor input, Tensor weight, Tensor bias={}, IntList[2] stride=1, IntList[2] padding=0, IntList[2] output_padding=0, int64_t groups=1, IntList[2] dilation=1) -> Tensor - variants: function - func: conv_transpose3d(Tensor input, Tensor weight, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] output_padding=0, int64_t groups=1, IntList[3] dilation=1) -> Tensor - variants: function - func: cos(Tensor self) -> Tensor + variants: function, method - func: cos_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _cos__cpu CUDA: _cos__cuda - func: cos_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _cos_out_cpu CUDA: _cos_out_cuda - func: cosh(Tensor self) -> Tensor + variants: function, method - func: cosh_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _cosh__cpu CUDA: _cosh__cuda - func: cosh_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _cosh_out_cpu CUDA: _cosh_out_cuda - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - func: cudnn_affine_grid_generator(Tensor theta, int64_t N, int64_t C, int64_t H, int64_t W) -> Tensor return: - type: Tensor name: grid - variants: function dispatch: CUDA: cudnn_affine_grid_generator_forward @@ -438,70 +427,57 @@ return: - type: Tensor name: grad_theta - variants: function dispatch: CUDA: cudnn_affine_grid_generator_backward - func: cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double exponential_average_factor, double epsilon) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: cudnn_batch_norm # NB: You can only use this if you used cudnn_batch_norm training=True - func: cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, double epsilon) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: cudnn_batch_norm_backward - func: cudnn_convolution(Tensor self, Tensor weight, Tensor? 
bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution - func: cudnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_backward_input - func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: cudnn_convolution_backward - func: cudnn_convolution_backward_bias(Tensor grad_output) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_backward_bias - func: cudnn_convolution_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_backward_weight - func: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the double # backwards - func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: cudnn_convolution_transpose_backward - func: cudnn_convolution_transpose_backward_bias(Tensor grad_output) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_backward_bias - func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_transpose_backward_input - func: cudnn_convolution_transpose_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: cudnn_convolution_transpose_backward_weight @@ -510,7 +486,6 @@ return: - type: Tensor name: output - variants: function dispatch: CUDA: cudnn_grid_sampler_forward @@ -520,99 +495,92 @@ name: grad_self - type: Tensor name: grad_grid - variants: function dispatch: CUDA: cudnn_grid_sampler_backward # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. - func: cumsum(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function, method - func: cumsum(Tensor self, int64_t dim) -> Tensor + variants: function, method - func: cumsum_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor - variants: function - func: cumsum_out(Tensor result, Tensor self, int64_t dim) -> Tensor - variants: function # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. 
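The FIXME above (and the matching one for cumsum earlier) points at the same limitation: an optional ScalarType argument cannot be expressed yet, so each op carries one overload with `dtype` and one without. A hedged sketch of what that pair looks like to a C++ caller, assuming an ATen build generated from these declarations; the function name and tensor `t` are placeholders:

    #include "ATen/ATen.h"

    // Sketch only: the two generated overloads the FIXME would like to merge.
    void cumsum_overloads(const at::Tensor& t) {
      // Without dtype: accumulate in t's own scalar type.
      at::Tensor a = at::cumsum(t, /*dim=*/0);
      // With dtype: accumulate in an explicitly requested type, e.g. 64-bit
      // integers to avoid overflow for small integer inputs.
      at::Tensor b = at::cumsum(t, /*dim=*/0, /*dtype=*/at::kLong);
      // cumprod below carries the same pair of overloads, and both ops are
      // also generated as Tensor methods ("variants: function, method").
      at::Tensor c = t.cumprod(/*dim=*/0);
      (void)a; (void)b; (void)c;
    }
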
- func: cumprod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function, method - func: cumprod(Tensor self, int64_t dim) -> Tensor + variants: function, method - func: cumprod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor - variants: function - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor - variants: function - func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function # convenience function that converts to intlists for you - func: ctc_loss(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) - variants: function dispatch: CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor - variants: function dispatch: CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu - func: det(Tensor self) -> Tensor + variants: function, method - func: diagflat(Tensor self, int64_t offset=0) -> Tensor + variants: function, method - func: diagonal(Tensor self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) -> Tensor + variants: function, method - func: div(Tensor self, Tensor other) -> Tensor - variants: method, function + variants: function, method - func: div_(Tensor self, Tensor other) -> Tensor variants: method - func: div_out(Tensor result, Tensor self, Tensor other) -> Tensor - variants: function # For C++ only, until we have conversion from C++ numbers to Tensor - func: div(Tensor self, Scalar other) -> Tensor + variants: function, method + - func: div_(Tensor self, Scalar other) -> Tensor variants: method - func: dot(Tensor self, Tensor tensor) -> Tensor + variants: function, method - func: dot_out(Tensor result, Tensor self, Tensor tensor) -> Tensor - variants: function - func: einsum(std::string equation, TensorList tensors) -> Tensor - variants: function - func: embedding(Tensor weight, IndexTensor indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) -> Tensor - variants: function - func: embedding_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor - variants: function - func: embedding_dense_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor - variants: function dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda - func: embedding_renorm_(Tensor self, IndexTensor indices, double max_norm, double norm_type) -> Tensor - variants: function dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ - func: embedding_sparse_backward(Tensor grad, IndexTensor indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) -> Tensor - variants: function # NOTE [ embedding_bag Native Functions ] # The `_embedding_bag.*` variants assume that input tensors except for `weight`, @@ -624,90 +592,84 @@ # The backward functions apply a check that these input tensors are contiguous. 
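The note above is a contract the schema itself cannot express: the `_embedding_bag*` backward entries check that the index/offset tensors (and the bookkeeping tensors returned by the forward) are contiguous. A hedged sketch of the call pattern a wrapper might use, assuming an ATen build generated from the declarations just below; `weight`, `indices`, `offsets`, and the function name are placeholder names:

    #include "ATen/ATen.h"
    #include <tuple>

    // Sketch only: forward call for the _embedding_bag entry declared below,
    // normalizing the index tensors the way the note above expects.
    std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
    embedding_bag_sketch(const at::Tensor& weight,
                         const at::Tensor& indices,
                         const at::Tensor& offsets) {
      // Everything except `weight` is assumed contiguous by the backward
      // checks, so make that true up front.
      at::Tensor idx = indices.contiguous();
      at::Tensor off = offsets.contiguous();
      // scale_grad_by_freq, mode, and sparse keep their schema defaults.
      return at::_embedding_bag(weight, idx, off);
    }
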
- func: embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor) - variants: function - func: _embedding_bag(Tensor weight, IndexTensor indices, IndexTensor offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false) -> (Tensor, Tensor, Tensor, Tensor) - variants: function dispatch: CPU: _embedding_bag_cpu CUDA: _embedding_bag_cuda - func: _embedding_bag_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse) -> Tensor - variants: function - func: _embedding_bag_sparse_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor - variants: function - func: _embedding_bag_dense_backward(Tensor grad, IndexTensor indices, IndexTensor offsets, IndexTensor offset2bag, IndexTensor bag_size, IndexTensor maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode) -> Tensor - variants: function dispatch: CPU: _embedding_bag_dense_backward_cpu CUDA: _embedding_bag_dense_backward_cuda - func: empty(IntList size, TensorOptions options={}) -> Tensor - variants: function - func: empty_out(Tensor result, IntList size) -> Tensor - variants: function - func: empty_like(Tensor self) -> Tensor - variants: function - func: empty_like(Tensor self, *, TensorOptions options) -> Tensor - variants: function - func: empty(Type dtype, IntList size) -> Tensor - variants: function deprecated: true - func: erf(Tensor self) -> Tensor + variants: function, method - func: erf_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _erf__cpu CUDA: _erf__cuda - func: erf_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _erf_out_cpu CUDA: _erf_out_cuda - func: erfc(Tensor self) -> Tensor + variants: function, method - func: erfc_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _erfc__cpu CUDA: _erfc__cuda - func: erfc_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _erfc_out_cpu CUDA: _erfc_out_cuda - func: exp(Tensor self) -> Tensor + variants: function, method - func: exp_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _exp__cpu CUDA: _exp__cuda - func: exp_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _exp_out_cpu CUDA: _exp_out_cuda - func: expm1(Tensor self) -> Tensor + variants: function, method - func: expm1_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _expm1__cpu CUDA: _expm1__cuda - func: expm1_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _expm1_out_cpu CUDA: _expm1_out_cuda @@ -719,60 +681,54 @@ variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
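The "method-only" remark above is one value of the `variants` key that this diff is rewriting throughout the file, so a hedged sketch of what the three spellings mean for the generated C++ surface may help while reading the rest of the churn; the op choices below are just representative entries from this file and the function name is a placeholder:

    #include "ATen/ATen.h"

    void variants_demo(at::Tensor t) {
      // variants: function, method  ->  both spellings are generated.
      at::Tensor a = at::abs(t);
      at::Tensor b = t.abs();

      // variants: function (now the implicit default for *_out entries)
      //   ->  namespace-level function only.
      at::Tensor out = at::empty_like(t);
      at::abs_out(out, t);

      // variants: method  ->  Tensor method only, e.g. the in-place add_
      // overloads in this file.
      t.add_(b);
      (void)a;
    }
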
- func: eye(int64_t n, TensorOptions options={}) -> Tensor - variants: function - func: eye(int64_t n, int64_t m, TensorOptions options={}) -> Tensor - variants: function - func: eye_out(Tensor result, int64_t n) -> Tensor - variants: function dispatch: CPU: eye_out_cpu CUDA: eye_out_cuda - func: eye_out(Tensor result, int64_t n, int64_t m) -> Tensor - variants: function dispatch: CPU: eye_out_cpu CUDA: eye_out_cuda - func: eye(Type dtype, int64_t n, int64_t m=-1) -> Tensor - variants: function deprecated: true - func: flatten(Tensor self, int64_t start_dim=0, int64_t end_dim=-1) -> Tensor + variants: function, method - func: fill_(Tensor self, Scalar value) -> Tensor + variants: function, method - func: fill_(Tensor self, Tensor value) -> Tensor + variants: function, method - func: floor(Tensor self) -> Tensor + variants: function, method - func: floor_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _floor__cpu CUDA: _floor__cuda - func: floor_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _floor_out_cpu CUDA: _floor_out_cuda - func: full(IntList size, Scalar fill_value, TensorOptions options={}) -> Tensor - variants: function - func: full_out(Tensor result, IntList size, Scalar fill_value) -> Tensor - variants: function - func: full_like(Tensor self, Scalar fill_value) -> Tensor - variants: function - func: full_like(Tensor self, Scalar fill_value, *, TensorOptions options) -> Tensor - variants: function - func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor - variants: function deprecated: true # NOTE [ grid_sampler Native Functions ] @@ -785,206 +741,205 @@ # enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in # `interpolation_mode` because it only supports Bilinear interpolation mode. 
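To make the NOTE above concrete: the generated functions take the interpolation and padding modes as plain int64_t values matching the enums in `native/GridSampler.h`, and `grid_sampler` itself is the dispatcher callers normally go through. A hedged sketch of a call, assuming an ATen build generated from the declarations below; the function name is a placeholder and the specific enum values are an assumption to be checked against that header:

    #include "ATen/ATen.h"

    // Sketch only. The mode integers are assumed to match
    // native/GridSampler.h (0 = Bilinear interpolation, 0 = Zeros padding);
    // verify against that header rather than trusting this comment.
    at::Tensor sample_sketch(const at::Tensor& input, const at::Tensor& grid) {
      const int64_t interpolation_mode = 0;  // Bilinear: the only mode
                                             // cudnn_grid_sampler supports.
      const int64_t padding_mode = 0;        // Zeros
      // grid_sampler picks the 2d or 3d kernel (and possibly the cuDNN path)
      // based on the inputs.
      return at::grid_sampler(input, grid, interpolation_mode, padding_mode);
    }
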
- func: grid_sampler(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function - func: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function dispatch: CPU: grid_sampler_2d_cpu CUDA: grid_sampler_2d_cuda - func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) - variants: function dispatch: CPU: grid_sampler_2d_backward_cpu CUDA: grid_sampler_2d_backward_cuda - func: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function dispatch: CPU: grid_sampler_3d_cpu CUDA: grid_sampler_3d_cuda - func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) - variants: function dispatch: CPU: grid_sampler_3d_backward_cpu CUDA: grid_sampler_3d_backward_cuda - func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor - variants: function - func: hann_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor - variants: function - func: hamming_window(int64_t window_length, TensorOptions options={}) -> Tensor - variants: function - func: hamming_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor - variants: function - func: hamming_window(int64_t window_length, bool periodic, double alpha, TensorOptions options={}) -> Tensor - variants: function - func: hamming_window(int64_t window_length, bool periodic, double alpha, double beta, TensorOptions options={}) -> Tensor - variants: function - func: hinge_embedding_loss(Tensor self, Tensor target, double margin=1.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - func: ger(Tensor self, Tensor vec2) -> Tensor + variants: function, method - func: ger_out(Tensor result, Tensor self, Tensor vec2) -> Tensor - variants: function - func: gesv(Tensor self, Tensor A) -> (Tensor, Tensor) + variants: function, method - func: gesv_out(Tensor solution, Tensor lu, Tensor self, Tensor A) -> (Tensor, Tensor) - variants: function # gesv handles broadcasting of arbitrary batch dims while _gesv_helper does not. - func: _gesv_helper(Tensor self, Tensor A) -> (Tensor, Tensor) + variants: function dispatch: CPU: _gesv_helper_cpu CUDA: _gesv_helper_cuda - func: group_norm(Tensor input, int64_t num_groups, Tensor? weight={}, Tensor? 
bias={}, double eps=1e-5, bool cudnn_enabled=True) -> Tensor - variants: function # FFT - func: fft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor + variants: function, method - func: ifft(Tensor self, int64_t signal_ndim, bool normalized=false) -> Tensor + variants: function, method - func: rfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true) -> Tensor + variants: function, method - func: irfft(Tensor self, int64_t signal_ndim, bool normalized=false, bool onesided=true, IntList signal_sizes={}) -> Tensor + variants: function, method - func: _fft_with_size(Tensor self, int64_t signal_ndim, bool complex_input, bool complex_output, bool inverse, IntList checked_signal_sizes, bool normalized, bool onesided, IntList output_sizes) -> Tensor + variants: function dispatch: CPU: _fft_mkl CUDA: _fft_cufft - func: _cufft_get_plan_cache_size() -> int64_t - variants: function device_guard: false - func: _cufft_get_plan_cache_max_size() -> int64_t - variants: function device_guard: false - func: _cufft_set_plan_cache_max_size(int64_t max_size) - variants: function device_guard: false - func: _cufft_clear_plan_cache() - variants: function device_guard: false - func: index(Tensor self, TensorList indices) -> Tensor + variants: function, method # NB: This function is special-cased in tools/autograd/gen_variable_type.py - func: index_copy_(Tensor self, int64_t dim, IndexTensor index, Tensor source) -> Tensor variants: method - func: index_put(Tensor self, TensorList indices, Tensor values) -> Tensor + variants: function, method - func: index_put_(Tensor self, TensorList indices, Tensor values) -> Tensor + variants: function, method - func: inverse(Tensor self) -> Tensor + variants: function, method - func: inverse_out(Tensor result, Tensor self) -> Tensor - variants: function - func: isclose(Tensor self, Tensor other, double rtol=1e-5, double atol=1e-8, bool equal_nan=False) -> Tensor + variants: function, method - func: is_cuda(Tensor self) -> bool + variants: function, method device_guard: false - func: is_distributed(Tensor self) -> bool + variants: function, method device_guard: false - func: is_floating_point(Tensor self) -> bool + variants: function, method + device_guard: false + +- func: is_complex(Tensor self) -> bool + variants: function, method device_guard: false - func: is_nonzero(Tensor self) -> bool + variants: function, method device_guard: false - func: is_same_size(Tensor self, Tensor other) -> bool + variants: function, method device_guard: false - func: is_signed(Tensor self) -> bool + variants: function, method device_guard: false - func: is_sparse(Tensor self) -> bool + variants: function, method device_guard: false - func: kl_div(Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function dispatch: CPU: kl_div_backward_cpu CUDA: kl_div_backward_cuda - func: kthvalue(Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + variants: function, method - func: kthvalue_out(Tensor values, Tensor indices, Tensor self, int64_t k, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) - variants: function - func: layer_norm(Tensor input, IntList normalized_shape, Tensor? weight={}, Tensor? 
bias={}, double eps=1e-5, bool cudnn_enable=True) -> Tensor - variants: function - func: linear(Tensor input, Tensor weight, Tensor bias={}) -> Tensor - variants: function - func: linspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor - variants: function - func: linspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor - variants: function - func: linspace_out(Tensor result, Scalar start, Scalar end) -> Tensor - variants: function - func: linspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor - variants: function - func: linspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor - variants: function deprecated: true - func: log(Tensor self) -> Tensor + variants: function, method - func: log_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _log__cpu CUDA: _log__cuda - func: log_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _log_out_cpu CUDA: _log_out_cuda - func: log10(Tensor self) -> Tensor + variants: function, method - func: log10_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _log10__cpu CUDA: _log10__cuda - func: log10_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _log10_out_cpu CUDA: _log10_out_cuda - func: log1p(Tensor self) -> Tensor + variants: function, method - func: log1p_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _log1p__cpu CUDA: _log1p__cuda @@ -992,7 +947,6 @@ SparseCUDA: log1p_sparse_ - func: log1p_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _log1p_out_cpu CUDA: _log1p_out_cuda @@ -1000,573 +954,518 @@ SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor + variants: function, method - func: log2_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _log2__cpu CUDA: _log2__cuda - func: log2_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _log2_out_cpu CUDA: _log2_out_cuda - func: logdet(Tensor self) -> Tensor + variants: function, method - func: logspace(Scalar start, Scalar end, TensorOptions options={}) -> Tensor - variants: function - func: logspace(Scalar start, Scalar end, int64_t steps, TensorOptions options={}) -> Tensor - variants: function - func: logspace_out(Tensor result, Scalar start, Scalar end) -> Tensor - variants: function - func: logspace_out(Tensor result, Scalar start, Scalar end, int64_t steps) -> Tensor - variants: function - func: logspace(Type dtype, Scalar start, Scalar end, int64_t steps=100) -> Tensor - variants: function deprecated: true - func: log_softmax(Tensor self, int64_t dim) -> Tensor + variants: function, method dispatch: CPU: log_softmax_cpu CUDA: log_softmax_cuda - func: log_softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor - variants: function dispatch: CPU: log_softmax_backward_cpu CUDA: log_softmax_backward_cuda - func: logsumexp(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function, method - func: logsumexp_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor - variants: function - - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - func: matmul(Tensor self, Tensor other) -> Tensor + variants: function, method - func: matmul_out(Tensor result, Tensor self, Tensor other) -> Tensor - variants: function - func: matrix_rank(Tensor 
self, double tol, bool symmetric=false) -> Tensor - variants: function - func: matrix_rank(Tensor self, bool symmetric=false) -> Tensor - variants: function + +- func: matrix_power(Tensor self, int64_t n) -> Tensor + variants: function, method - func: max(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function, method - func: max_out(Tensor max, Tensor max_values, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) - variants: function - func: max_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method - func: max_pool1d_with_indices(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> (Tensor, Tensor) - variants: function - func: max_pool1d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor - variants: function - func: max_pool2d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor - variants: function - func: max_pool3d(Tensor self, IntList[1] kernel_size, IntList[1] stride={}, IntList[1] padding=0, IntList[1] dilation=1, bool ceil_mode=false) -> Tensor - variants: function # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. - func: mean(Tensor self, *, ScalarType dtype) -> Tensor + variants: function, method - func: mean(Tensor self) -> Tensor + variants: function, method - func: mean(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function, method - func: mean(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function, method - func: mean(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function, method - func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor - variants: function - func: mean_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor - variants: function - func: mean_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor - variants: function - func: median(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function, method - func: median_out(Tensor values, Tensor indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) - variants: function - func: min(Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) + variants: function, method - func: min_out(Tensor min, Tensor min_indices, Tensor self, int64_t dim, bool keepdim=false) -> (Tensor, Tensor) - variants: function - func: min_values(Tensor self, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? 
bias, IntList padding, IntList stride, IntList dilation, int64_t groups) -> Tensor - variants: function - func: mkldnn_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> Tensor - variants: function - func: mkldnn_convolution_backward_weights(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool bias_defined) -> (Tensor, Tensor) - variants: function - func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, std::array output_mask) -> (Tensor, Tensor, Tensor) - variants: function - func: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, double exponential_average_factor, double epsilon) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: miopen_batch_norm - func: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, double epsilon) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: miopen_batch_norm_backward - func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: miopen_convolution - func: miopen_convolution_backward_input(IntList self_size, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: miopen_convolution_backward_input - func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: miopen_convolution_backward - func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor - variants: function dispatch: CUDA: miopen_convolution_backward_bias - func: miopen_convolution_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: miopen_convolution_backward_weight - func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? 
bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: miopen_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the double # backwards - func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic, std::array output_mask) -> (Tensor, Tensor, Tensor) - variants: function dispatch: CUDA: miopen_convolution_transpose_backward - func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: miopen_convolution_transpose_backward_input - func: miopen_convolution_transpose_backward_weight(IntList weight_size, Tensor grad_output, Tensor self, IntList padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) -> Tensor - variants: function dispatch: CUDA: miopen_convolution_transpose_backward_weight - func: mm(Tensor self, Tensor mat2) -> Tensor + variants: function, method - func: mm_out(Tensor result, Tensor self, Tensor mat2) -> Tensor - variants: function - func: mode(Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) + variants: function, method - func: mode_out(Tensor values, Tensor indices, Tensor self, int64_t dim=-1, bool keepdim=false) -> (Tensor, Tensor) - variants: function - func: mul(Tensor self, Tensor other) -> Tensor - variants: method, function + variants: function, method - func: mul_(Tensor self, Tensor other) -> Tensor variants: method - func: mul_out(Tensor result, Tensor self, Tensor other) -> Tensor - variants: function # For C++ only, until we have conversion from C++ numbers to Tensor - func: mul(Tensor self, Scalar other) -> Tensor + variants: function, method + - func: mul_(Tensor self, Scalar other) -> Tensor variants: method - func: mv(Tensor self, Tensor vec) -> Tensor + variants: function, method - func: mv_out(Tensor result, Tensor self, Tensor vec) -> Tensor - variants: function - func: mvlgamma(Tensor self, int64_t p) -> Tensor + variants: function, method - func: mvlgamma_(Tensor self, int64_t p) -> Tensor variants: method - func: narrow(Tensor self, int64_t dim, int64_t start, int64_t length) -> Tensor + variants: function, method - func: ones(IntList size, TensorOptions options={}) -> Tensor - variants: function - func: ones_out(Tensor result, IntList size) -> Tensor - variants: function - func: ones_like(Tensor self) -> Tensor - variants: function - func: ones_like(Tensor self, *, TensorOptions options) -> Tensor - variants: function - func: ones(Type dtype, IntList size) -> Tensor - variants: function deprecated: true - func: pairwise_distance(Tensor x1, Tensor x2, double p=2, double eps=1e-6, bool keepdim=false) -> Tensor - variants: function - func: pdist(Tensor self, double p=2) -> Tensor - variants: function - func: _pdist_forward(Tensor self, double p=2) -> Tensor - variants: function - func: _pdist_backward(Tensor grad, Tensor self, double p, Tensor pdist) -> Tensor - variants: function - func: permute(Tensor self, IntList dims) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
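Before moving on: the `pdist` / `_pdist_forward` / `_pdist_backward` entries a few lines above are the schema side of the kernels added earlier in this diff. Those kernels flatten the strict upper triangle of the n x n distance matrix into a vector of n*(n-1)/2 entries, and the backward kernels recover the pair (i, j) from a flat index k via i = trunc(n2 - sqrt(n2*n2 - 2k - 1)) with n2 = n - 0.5, then j = k - n*i + i*(i+1)/2 + i + 1. A small self-contained C++ check of that arithmetic, independent of ATen and purely illustrative:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Sketch only: verifies that the flat-index -> (i, j) formula used by the
    // pdist kernels walks the upper triangle (i < j) in row-major order.
    int main() {
      const int64_t n = 5;                    // number of input rows
      const int64_t combs = n * (n - 1) / 2;  // length of the condensed output
      const double n2 = n - 0.5;              // same shift the kernels use
      for (int64_t k = 0; k < combs; ++k) {
        // The "- 1" under the sqrt guards against floating point truncation,
        // mirroring the comment in the kernels.
        int64_t i = static_cast<int64_t>(n2 - std::sqrt(n2 * n2 - 2 * k - 1));
        int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
        std::printf("k=%lld -> (i=%lld, j=%lld)\n",
                    (long long)k, (long long)i, (long long)j);
      }
      return 0;
    }

For n = 5 this prints (0,1), (0,2), ... , (3,4) in order, which is the layout the forward kernels write and `_pdist_backward` reads.
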
- func: pin_memory(Tensor self) -> Tensor + variants: function, method - func: pinverse(Tensor self, double rcond=1e-15) -> Tensor + variants: function, method - func: rand(IntList size, *, TensorOptions options={}) -> Tensor - variants: function - func: rand(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor - variants: function - func: rand_out(Tensor result, IntList size, *) -> Tensor - variants: function - func: rand_out(Tensor result, IntList size, *, Generator* generator) -> Tensor - variants: function - func: rand_like(Tensor self) -> Tensor - variants: function - func: rand_like(Tensor self, *, TensorOptions options) -> Tensor - variants: function - func: rand(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor - variants: function deprecated: true - func: randint(int64_t high, IntList size, *, TensorOptions options={}) -> Tensor - variants: function - func: randint(int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor - variants: function - func: randint(int64_t low, int64_t high, IntList size, *, TensorOptions options={}) -> Tensor - variants: function - func: randint(int64_t low, int64_t high, IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor - variants: function - func: randint(Type dtype, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor - variants: function deprecated: true - func: randint(Type dtype, int64_t low, int64_t high, IntList size, *, Generator* generator=nullptr) -> Tensor - variants: function deprecated: true - func: randint_out(Tensor result, int64_t high, IntList size, *) -> Tensor - variants: function - func: randint_out(Tensor result, int64_t high, IntList size, *, Generator* generator) -> Tensor - variants: function - func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *) -> Tensor - variants: function - func: randint_out(Tensor result, int64_t low, int64_t high, IntList size, *, Generator* generator) -> Tensor - variants: function - func: randint_like(Tensor self, int64_t high) -> Tensor - variants: function - func: randint_like(Tensor self, int64_t low, int64_t high) -> Tensor - variants: function - func: randint_like(Tensor self, int64_t high, *, TensorOptions options) -> Tensor - variants: function - func: randint_like(Tensor self, int64_t low, int64_t high, *, TensorOptions options) -> Tensor - variants: function - func: randn(IntList size, *, TensorOptions options={}) -> Tensor - variants: function - func: randn(IntList size, *, Generator* generator, TensorOptions options={}) -> Tensor - variants: function - func: randn_out(Tensor result, IntList size, *) -> Tensor - variants: function - func: randn_out(Tensor result, IntList size, *, Generator* generator) -> Tensor - variants: function - func: randn_like(Tensor self) -> Tensor - variants: function - func: randn_like(Tensor self, *, TensorOptions options) -> Tensor - variants: function - func: randn(Type dtype, IntList size, *, Generator* generator=nullptr) -> Tensor - variants: function deprecated: true - func: randperm(int64_t n, *, TensorOptions options={}) -> Tensor - variants: function - func: randperm(int64_t n, *, Generator* generator, TensorOptions options={}) -> Tensor - variants: function - func: randperm_out(Tensor result, int64_t n, *) -> Tensor - variants: function - func: randperm_out(Tensor result, int64_t n, *, Generator* generator) -> Tensor - variants: function dispatch: CPU: randperm_out_cpu CUDA: randperm_out_cuda - func: randperm(Type dtype, 
int64_t n, *, Generator* generator=nullptr) -> Tensor - variants: function deprecated: true - func: range(Scalar start, Scalar end, TensorOptions options={}) -> Tensor - variants: function - func: range(Scalar start, Scalar end, Scalar step, TensorOptions options={}) -> Tensor - variants: function - func: range_out(Tensor result, Scalar start, Scalar end) -> Tensor - variants: function - func: range_out(Tensor result, Scalar start, Scalar end, Scalar step) -> Tensor - variants: function - func: range(Type dtype, Scalar start, Scalar end, Scalar step=1) -> Tensor - variants: function deprecated: true - func: repeat(Tensor self, IntList repeats) -> Tensor variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. - func: reshape(Tensor self, IntList shape) -> Tensor + variants: function, method - func: reshape_as(Tensor self, Tensor other) -> Tensor variants: method - func: RoiPooling2d_forward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale) -> (Tensor, Tensor) - variants: function dispatch: CPU: RoiPooling2d_forward_cpu CUDA: RoiPooling2d_forward_cuda - func: RoiPooling2d_backward(Tensor input, Tensor rois, int64_t pooledHeight, int64_t pooledWidth, double spatialScale, Tensor gradOutput, Tensor argmaxes) -> Tensor - variants: function dispatch: CPU: RoiPooling2d_backward_cpu CUDA: RoiPooling2d_backward_cuda - func: round(Tensor self) -> Tensor + variants: function, method - func: round_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _round__cpu CUDA: _round__cuda - func: round_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _round_out_cpu CUDA: _round_out_cuda - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor - variants: function - func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor - variants: function - func: relu(Tensor self) -> Tensor + variants: function, method - func: relu_(Tensor self) -> Tensor + variants: function, method - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor + variants: function, method dispatch: CPU: hardshrink_cpu CUDA: hardshrink_cuda - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor + variants: function, method dispatch: CPU: hardshrink_backward_cpu CUDA: hardshrink_backward_cuda - func: rsqrt(Tensor self) -> Tensor + variants: function, method - func: rsqrt_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _rsqrt__cpu CUDA: _rsqrt__cuda - func: rsqrt_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _rsqrt_out_cpu CUDA: _rsqrt_out_cuda - func: select(Tensor self, int64_t dim, int64_t index) -> Tensor + variants: function, method - func: selu(Tensor self) -> Tensor - variants: function - func: selu_(Tensor self) -> Tensor - variants: function - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor - variants: function - func: celu_(Tensor self, Scalar alpha=1.0) -> Tensor - variants: function - func: sigmoid(Tensor self) -> Tensor + variants: function, method - func: sigmoid_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _sigmoid__cpu CUDA: _sigmoid__cuda - func: sigmoid_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _sigmoid_out_cpu CUDA: _sigmoid_out_cuda - func: sin(Tensor self) -> Tensor + 
variants: function, method - func: sin_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _sin__cpu CUDA: _sin__cuda - func: sin_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _sin_out_cpu CUDA: _sin_out_cuda - func: sinh(Tensor self) -> Tensor + variants: function, method - func: sinh_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _sinh__cpu CUDA: _sinh__cuda - func: sinh_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _sinh_out_cpu CUDA: _sinh_out_cuda - func: detach(Tensor self) -> Tensor + variants: function, method - func: detach_(Tensor self) -> Tensor + variants: function, method - func: size(Tensor self, int64_t dim) -> int64_t + variants: function, method device_guard: false - func: slice(Tensor self, int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) -> Tensor + variants: function, method - func: slogdet(Tensor self) -> (Tensor, Tensor) + variants: function, method - func: smm(Tensor self, Tensor mat2) -> Tensor + variants: function, method - func: softmax(Tensor self, int64_t dim) -> Tensor + variants: function, method dispatch: CPU: softmax_cpu CUDA: softmax_cuda - func: softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self) -> Tensor - variants: function dispatch: CPU: softmax_backward_cpu CUDA: softmax_backward_cuda - func: _sparse_add_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - variants: function dispatch: SparseCPU: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda - func: _sparse_dense_add_out(Tensor result, Tensor self, SparseTensorRef other, *, Scalar alpha=1) -> Tensor - variants: function dispatch: CPU: add_out_dense_sparse_cpu CUDA: add_out_dense_sparse_cuda - func: _sparse_div_zerodim_out(Tensor result, Tensor self, Tensor other) -> Tensor - variants: function dispatch: SparseCPU: div_out_sparse_zerodim SparseCUDA: div_out_sparse_zerodim - func: _sparse_div_scalar_out(Tensor result, Tensor self, Scalar other) -> Tensor - variants: function dispatch: SparseCPU: div_out_sparse_scalar SparseCUDA: div_out_sparse_scalar - func: _sparse_mul_out(Tensor result, Tensor self, Tensor other) -> Tensor - variants: function dispatch: SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda - func: _sparse_mul_zerodim_out(Tensor result, Tensor self, Tensor other) -> Tensor - variants: function dispatch: SparseCPU: mul_out_sparse_zerodim SparseCUDA: mul_out_sparse_zerodim - func: _sparse_mul_scalar_out(Tensor result, Tensor self, Scalar other) -> Tensor - variants: function dispatch: SparseCPU: mul_out_sparse_scalar SparseCUDA: mul_out_sparse_scalar - func: split(Tensor self, int64_t split_size, int64_t dim=0) -> TensorList + variants: function, method - func: split_with_sizes(Tensor self, IntList split_sizes, int64_t dim=0) -> TensorList + variants: function, method - func: squeeze(Tensor self) -> Tensor + variants: function, method - func: squeeze(Tensor self, int64_t dim) -> Tensor + variants: function, method - func: squeeze_(Tensor self) -> Tensor variants: method @@ -1575,9 +1474,9 @@ variants: method - func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor + variants: function, method - func: sspaddmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: function dispatch: CPU: _sspaddmm_out_only_sparse CUDA: _sspaddmm_out_only_sparse_cuda @@ -1585,138 +1484,148 @@ SparseCUDA: 
_sspaddmm_out_cuda - func: stack(TensorList tensors, int64_t dim=0) -> Tensor - variants: function - func: stack_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor - variants: function # The signature is designed to be consistent with librosa except that it is # missing the `pad_mode` and `center` arguments, which are taken care of at # `torch.functional.py`. They shall be moved here once we have mapping between # Python strings and C++ Enum in codegen. - func: stft(Tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, Tensor? window={}, bool normalized=false, bool onesided=true) -> Tensor + variants: function, method python_default_init: hop_length: n_fft >> 2 win_length: n_fft - func: stride(Tensor self, int64_t dim) -> int64_t + variants: function, method device_guard: false # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. - func: sum(Tensor self, *, ScalarType dtype) -> Tensor + variants: function, method - func: sum(Tensor self) -> Tensor + variants: function, method - func: _sum(Tensor self) -> Tensor + variants: function dispatch: CPU: _sum_cpu CUDA: _sum_cuda - func: sum(Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function, method - func: sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + variants: function, method - func: sum(Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor + variants: function, method - func: _sum(Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor + variants: function - func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - variants: function - func: sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor - variants: function - func: sum_out(Tensor result, Tensor self, IntList[1] dim, *, ScalarType dtype) -> Tensor - variants: function - func: _sum_out(Tensor result, Tensor self, IntList[1] dim, bool keepdim=False) -> Tensor - variants: function - func: _sum_cuda_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor - variants: function dispatch: CUDA: _sum_out_cuda - func: sqrt(Tensor self) -> Tensor + variants: function, method - func: sqrt_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _sqrt__cpu CUDA: _sqrt__cuda - func: sqrt_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _sqrt_out_cpu CUDA: _sqrt_out_cuda - func: std(Tensor self, bool unbiased=true) -> Tensor + variants: function, method - func: std(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + variants: function, method - func: std_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor - variants: function # FIXME: These could be combined as optional but for https://github.com/pytorch/pytorch/issues/6593. 
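The Declarations hunks above and below follow one pattern: entries that used to rely on the old implicit default now spell out "variants: function, method", while *_out overloads and factory-style entries are pinned to "variants: function" (matching the new default set in native_parse.py further down). A minimal sketch of what the two variants mean at a C++ call site, assuming a standard ATen build at this revision (the function below is illustrative, not part of the patch):

#include <ATen/ATen.h>

void variants_sketch() {
  at::Tensor t = at::randn({2, 3});   // factory op: function variant only
  at::Tensor a = t.sum();             // "method" variant
  at::Tensor b = at::sum(t);          // "function" variant
  at::Tensor out = at::empty({2});
  at::sum_out(out, t, {1});           // *_out entries stay function-only
}
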
- func: prod(Tensor self, *, ScalarType dtype) -> Tensor + variants: function, method - func: prod(Tensor self) -> Tensor + variants: function, method - func: _prod(Tensor self) -> Tensor + variants: function dispatch: CPU: _prod_cpu CUDA: _prod_cuda - func: prod(Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor + variants: function, method - func: prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function, method - func: prod(Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor + variants: function, method - func: _prod(Tensor self, int64_t dim, bool keepdim=False) -> Tensor + variants: function - func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim, *, ScalarType dtype) -> Tensor - variants: function - func: prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor - variants: function - func: prod_out(Tensor result, Tensor self, int64_t dim, *, ScalarType dtype) -> Tensor - variants: function - func: _prod_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor - variants: function dispatch: CPU: _prod_out_cpu CUDA: _prod_out_cuda - func: t(Tensor self) -> Tensor + variants: function, method - func: t_(Tensor self) -> Tensor variants: method - func: tan(Tensor self) -> Tensor + variants: function, method - func: tan_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _tan__cpu CUDA: _tan__cuda - func: tan_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _tan_out_cpu CUDA: _tan_out_cuda - func: tanh(Tensor self) -> Tensor + variants: function, method - func: tanh_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _tanh__cpu CUDA: _tanh__cuda - func: tanh_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _tanh_out_cpu CUDA: _tanh_out_cuda @@ -1725,33 +1634,35 @@ variants: function - func: transpose(Tensor self, int64_t dim0, int64_t dim1) -> Tensor + variants: function, method - func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor variants: method - func: flip(Tensor self, IntList dims) -> Tensor + variants: function, method dispatch: CPU: flip_cpu CUDA: flip_cuda # default IntList value {0,1} should not add space after comma, since native_parse.py uses ', ' to split args - func: rot90(Tensor self, int64_t k=1, IntList dims={0,1}) -> Tensor + variants: function, method - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, IntList expand1, IntList expand2, IntList expand3, IntList sumdim, int64_t unroll_dim=1) -> Tensor - variants: function - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - func: trunc(Tensor self) -> Tensor + variants: function, method - func: trunc_(Tensor self) -> Tensor + variants: function, method dispatch: CPU: _trunc__cpu CUDA: _trunc__cuda - func: trunc_out(Tensor result, Tensor self) -> Tensor - variants: function dispatch: CPU: _trunc_out_cpu CUDA: _trunc_out_cuda @@ -1760,29 +1671,32 @@ variants: method - func: _unique(Tensor self, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + variants: function dispatch: CPU: _unique_cpu CUDA: _unique_cuda - func: _unique_dim(Tensor self, int64_t dim, bool sorted=false, bool return_inverse=false) -> (Tensor, Tensor) + variants: function dispatch: CPU: _unique_dim_cpu CUDA: _unique_dim_cuda - func: _unsafe_view(Tensor self, IntList 
size) -> Tensor - variants: function - func: unsqueeze(Tensor self, int64_t dim) -> Tensor + variants: function, method - func: unsqueeze_(Tensor self, int64_t dim) -> Tensor variants: method - func: var(Tensor self, bool unbiased=true) -> Tensor + variants: function, method - func: var(Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor + variants: function, method - func: var_out(Tensor result, Tensor self, int64_t dim, bool unbiased=true, bool keepdim=false) -> Tensor - variants: function - func: view_as(Tensor self, Tensor other) -> Tensor variants: method @@ -1791,39 +1705,38 @@ # this allows us to implicitly calculate the broadcast derivative, while only dealing with the # _s_where derivative. - func: where(BoolTensor condition, Tensor self, Tensor other) -> Tensor + variants: function, method + - func: _s_where(BoolTensor condition, Tensor self, Tensor other) -> Tensor + variants: function dispatch: CPU: _s_where_cpu CUDA: _s_where_cuda - func: zeros(IntList size, TensorOptions options={}) -> Tensor - variants: function - func: zeros_out(Tensor result, IntList size) -> Tensor - variants: function - func: zeros_like(Tensor self) -> Tensor - variants: function - func: zeros_like(Tensor self, *, TensorOptions options) -> Tensor - variants: function - func: zeros(Type dtype, IntList size) -> Tensor - variants: function deprecated: true - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor + variants: function dispatch: CPU: _standard_gamma_grad_cpu CUDA: _standard_gamma_grad_cuda - func: _standard_gamma(Tensor self, Generator* generator=nullptr) -> Tensor + variants: function dispatch: CPU: _s_gamma_cpu CUDA: _s_gamma_cuda - func: poisson(Tensor self, Generator* generator=nullptr) -> Tensor - variants: function dispatch: CPU: _s_poisson_cpu CUDA: _s_poisson_cuda @@ -1832,115 +1745,107 @@ # complicated - func: native_norm(Tensor self, Scalar p=2) -> Tensor - variants: function dispatch: SparseCPU: norm_sparse SparseCUDA: norm_sparse - func: norm(Tensor self, Scalar p=2) -> Tensor - variants: method, function + variants: function, method - func: norm(Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor + variants: function, method python_default_init: p: 2 - func: norm_out(Tensor result, Tensor self, Scalar p, int64_t dim, bool keepdim=false) -> Tensor - variants: function python_default_init: p: 2 - func: native_clone(Tensor self) -> Tensor - variants: function dispatch: SparseCPU: clone_sparse SparseCUDA: clone_sparse - func: clone(Tensor self) -> Tensor + variants: function, method - func: native_resize_as_(Tensor self, Tensor the_template) -> Tensor - variants: function dispatch: SparseCPU: resize_as_sparse_ SparseCUDA: resize_as_sparse_ - func: resize_as_(Tensor self, Tensor the_template) -> Tensor + variants: function, method - func: native_pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor - variants: function dispatch: SparseCPU: pow_out_sparse_scalar SparseCUDA: pow_out_sparse_scalar - func: native_pow(Tensor self, Scalar exponent) -> Tensor - variants: function dispatch: SparseCPU: pow_sparse_scalar SparseCUDA: pow_sparse_scalar - func: pow_out(Tensor result, Tensor self, Scalar exponent) -> Tensor - variants: function - func: pow(Tensor self, Scalar exponent) -> Tensor + variants: function, method variants: method, function - func: native_zero_(Tensor self) -> Tensor - variants: function dispatch: SparseCPU: zero_sparse_ SparseCUDA: zero_sparse_ - func: zero_(Tensor self) -> Tensor + variants: method, function - 
func: sub_out(Tensor result, Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - variants: function - func: sub(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - variants: method, function + variants: function, method - func: sub_(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub(Tensor self, Scalar other, Scalar alpha=1) -> Tensor + variants: function, method + - func: sub_(Tensor self, Scalar other, Scalar alpha=1) -> Tensor variants: method - func: s_native_addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: function dispatch: CPU: s_addmm_out_sparse_dense_cpu CUDA: s_addmm_out_sparse_dense_cuda - func: s_native_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: function dispatch: CPU: s_addmm_sparse_dense_cpu CUDA: s_addmm_sparse_dense_cuda - func: s_native_addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: function dispatch: CPU: s_addmm_sparse_dense_cpu_ CUDA: s_addmm_sparse_dense_cuda_ - func: addmm_out(Tensor result, Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: function - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - variants: method, function + variants: function, method - func: addmm_(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor variants: method - func: native_tensor(Type self_ty) -> Tensor - variants: function dispatch: SparseCPU: new_sparse SparseCUDA: new_sparse - func: native_tensor(Type self_ty, IntList size) -> Tensor - variants: function dispatch: SparseCPU: new_with_size_sparse SparseCUDA: new_with_size_sparse @@ -1998,8 +1903,6 @@ SparseCUDA: new_with_tensor_and_size_unsafe_sparse - func: _sparse_coo_tensor_unsafe(IndexTensor indices, Tensor values, IntList size) -> Tensor - variants: function - - func: sparse_resize_(Tensor self, IntList size, int64_t sparseDims, int64_t denseDims) -> Tensor variants: method @@ -2014,7 +1917,7 @@ SparseCUDA: sparse_resize_and_clear_ -- func: _sparse_mask(Tensor self, SparseTensorRef mask) -> Tensor +- func: sparse_mask(Tensor self, SparseTensorRef mask) -> Tensor variants: method dispatch: CPU: sparse_mask_cpu @@ -2037,7 +1940,7 @@ # legacy method - func: _dimI(Tensor self) -> int64_t - variants: method + variants: function dispatch: _sparseDims_sparse device_guard: False @@ -2051,7 +1954,7 @@ # legacy method - func: _dimV(Tensor self) -> int64_t - variants: method + variants: function dispatch: _denseDims_sparse device_guard: False @@ -2096,52 +1999,42 @@ - func: hspmm_out(Tensor result, Tensor mat1, Tensor mat2) -> Tensor - variants: function dispatch: SparseCPU: hspmm_out_sparse_cpu SparseCUDA: hspmm_out_sparse_cuda - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor - variants: function dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda # This "raw copy" doesn't handle conversions NOR does it handle non-blocking. 
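For the Scalar overloads of sub/sub_ above (flagged "For C++ only, until we have conversion from C++ numbers to Tensor"), the switch to "variants: function, method" also exposes the scalar form as a free function. A small sketch of the resulting call sites, assuming this revision of ATen:

#include <ATen/ATen.h>

void sub_scalar_sketch() {
  at::Tensor x = at::ones({3});
  at::Tensor y = x.sub(1);       // method variant, Scalar overload
  at::Tensor z = at::sub(x, 1);  // function variant enabled by the change above
  x.sub_(0.5);                   // in-place form remains method-only
}
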
- func: raw_copy_sparse_(Tensor self, Tensor src) -> Tensor - variants: function dispatch: SparseCPU: copy_sparse_ SparseCUDA: copy_sparse_ - func: numel(Tensor self) -> int64_t - variants: - - method - - function + variants: function, method device_guard: False - func: unbind(Tensor self, int64_t dim=0) -> TensorList - variants: - - method - - function + variants: function, method - func: native_get_device(Tensor self) -> int64_t - variants: function dispatch: SparseCUDA: get_device_sparse_cuda device_guard: False - func: get_device(Tensor self) -> int64_t + variants: function, method device_guard: False - func: meshgrid(TensorList tensors) -> TensorList - variants: function # This has a method dispatch to work around circular include problems - func: _local_scalar(Tensor self) -> Scalar - variants: - - function - - method + variants: function, method # NB: Does NOT check precondition that numel == 1 # WARNING: Use of cpu_half here is generally not supported; please @@ -2151,73 +2044,53 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda - variants: - - function + variants: function # Fused RNN kernels - func: _thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias={}, Tensor? hidden_bias={}) -> (Tensor, Tensor, Tensor) dispatch: CUDA: _thnn_fused_lstm_cell_cuda - variants: function - func: _thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: _thnn_fused_lstm_cell_backward_cuda - variants: function - func: _thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias={}, Tensor? hidden_bias={}) -> (Tensor, Tensor) dispatch: CUDA: _thnn_fused_gru_cell_cuda - variants: function - func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) dispatch: CUDA: _thnn_fused_gru_cell_backward_cuda - variants: function # RNN cells and layers - func: lstm(Tensor input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) - variants: function - func: lstm(Tensor data, Tensor batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) - variants: function - func: gru(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - variants: function - func: gru(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - variants: function - func: rnn_tanh(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - variants: function - func: rnn_tanh(Tensor data, Tensor batch_sizes, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - variants: function - func: rnn_relu(Tensor input, Tensor hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - variants: function - func: rnn_relu(Tensor data, Tensor batch_sizes, Tensor hx, 
TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - variants: function - func: lstm_cell(Tensor input, TensorList hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> (Tensor, Tensor) - variants: function - func: gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> Tensor - variants: function - func: rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> Tensor - variants: function - func: rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih={}, Tensor? b_hh={}) -> Tensor - variants: function # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) - variants: function - func: _pack_padded_sequence_backward(Tensor grad, IntList input_size, Tensor batch_sizes, bool batch_first) -> Tensor - variants: function - func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int64_t total_length) -> (Tensor, Tensor) - variants: function diff --git a/aten/src/ATen/native_parse.py b/aten/src/ATen/native_parse.py index e90c9f52228e4..8d1dd01152a8c 100644 --- a/aten/src/ATen/native_parse.py +++ b/aten/src/ATen/native_parse.py @@ -46,7 +46,7 @@ def parse_arguments(args, func_decl, func_name, func_return): arguments = [] python_default_inits = func_decl.get('python_default_init', {}) is_out_fn = func_name.endswith('_out') - if is_out_fn and func_decl.get('variants', []) not in ['function', ['function']]: + if is_out_fn and func_decl.get('variants', []) not in [[], 'function', ['function']]: raise RuntimeError("Native functions suffixed with _out MUST be declared with only the function variant; " "e.g., variants: function; otherwise you will tickle a Python argument binding bug " "(which usually manifests itself as the result variable being undefined.) 
" @@ -130,7 +130,7 @@ def run(paths): arguments = parse_arguments(arguments, func, declaration['name'], return_type) output_arguments = [x for x in arguments if x.get('output')] declaration['return'] = return_type if len(output_arguments) == 0 else output_arguments - declaration['variants'] = func.get('variants', ['method', 'function']) + declaration['variants'] = func.get('variants', ['function']) declaration['cpu_half'] = func.get('cpu_half', False) declaration['deprecated'] = func.get('deprecated', False) declaration['device_guard'] = func.get('device_guard', True) diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 248f2e9c78848..82a2f00ff77bc 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -29,7 +29,7 @@ inline Tensor from_blob( IntList sizes, const std::function& deleter, const TensorOptions& options = {}) { - return at::getMaybeVariableType(options).tensorFromBlob(data, sizes, deleter); + return at::getType(options).tensorFromBlob(data, sizes, deleter); } inline Tensor from_blob( @@ -38,7 +38,7 @@ inline Tensor from_blob( IntList strides, const std::function& deleter, const TensorOptions& options = {}) { - return at::getMaybeVariableType(options).tensorFromBlob(data, sizes, strides, deleter); + return at::getType(options).tensorFromBlob(data, sizes, strides, deleter); } inline Tensor from_blob( diff --git a/aten/src/ATen/templates/RegisterCPU.cpp b/aten/src/ATen/templates/RegisterCPU.cpp index 0c1eeb4818fbb..65b1d5e06cd27 100644 --- a/aten/src/ATen/templates/RegisterCPU.cpp +++ b/aten/src/ATen/templates/RegisterCPU.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include ${cpu_type_headers} @@ -13,8 +13,7 @@ namespace at { void register_cpu_types(Context * context) { ${cpu_type_registrations} - context->type_registry[static_cast(Backend::Undefined)] - [static_cast(ScalarType::Undefined)].reset(new UndefinedType()); + context->registerType(Backend::Undefined, ScalarType::Undefined, new UndefinedType()); } } // namespace at diff --git a/aten/src/ATen/templates/RegisterCUDA.cpp b/aten/src/ATen/templates/RegisterCUDA.cpp index 40c00c1f400c5..b327b783e161b 100644 --- a/aten/src/ATen/templates/RegisterCUDA.cpp +++ b/aten/src/ATen/templates/RegisterCUDA.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include ${cuda_type_headers} diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 967a123b983d9..77c18466404c8 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -157,10 +157,10 @@ struct AT_API Tensor { const Storage& storage() const { return tensor_impl_->storage(); } - inline Tensor toType(const Type & t, bool non_blocking=false) const; - inline Tensor & copy_(const Tensor & src, bool non_blocking=false); - inline Tensor toType(ScalarType t) const; - inline Tensor toBackend(Backend b) const; + Tensor toType(const Type & t, bool non_blocking=false) const; + Tensor & copy_(const Tensor & src, bool non_blocking=false); + Tensor toType(ScalarType t) const; + Tensor toBackend(Backend b) const; /// New-style `to()` methods. /// NB: These methods are defined in TensorOptions.h. @@ -194,12 +194,12 @@ struct AT_API Tensor { //toLongData(), toFloatData() etc. 
#define TO_TYPE_DATA(T,name,_) \ T * to##name##Data() const; - AT_FORALL_SCALAR_TYPES(TO_TYPE_DATA) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_TYPE_DATA) #undef TO_TYPE_DATA #define TO_C_TYPE(T,name,_) \ T toC##name () const; - AT_FORALL_SCALAR_TYPES(TO_C_TYPE) + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(TO_C_TYPE) #undef TO_C_TYPE template @@ -244,9 +244,7 @@ struct AT_API Tensor { return tensor_impl_->grad(); } - void set_data(Tensor new_data) { - tensor_impl_->set_data(new_data); - } + void set_data(Tensor new_data); /// Computes the gradient of current tensor w.r.t. graph leaves. void backward( diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 4aaaf8203bb78..07e5f2b634372 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -83,6 +83,17 @@ inline Tensor Tensor::to(Device device, bool non_blocking) const { return detail::to(*this, options().device(device), non_blocking); } +inline void Tensor::backward( + at::optional gradient, + bool keep_graph, + bool create_graph) { + type().backward(*this, std::move(gradient), keep_graph, create_graph); +} + +inline void Tensor::set_data(Tensor new_data) { + type().set_data(*this, new_data); +} + // all static inline to allow for inlining of the non-dynamic part of dispatch ${tensor_method_definitions} @@ -101,13 +112,13 @@ inline Tensor Tensor::to(Device device, bool non_blocking) const { return data(); \ } -AT_FORALL_SCALAR_TYPES(DEFINE_CAST) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST) #undef DEFINE_CAST #define DEFINE_TO_C_TYPE(T,name,_) \ inline T Tensor::toC##name () const { return _local_scalar().to##name (); } -AT_FORALL_SCALAR_TYPES(DEFINE_TO_C_TYPE) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_TO_C_TYPE) #undef DEFINE_TO_C_TYPE } //namespace at diff --git a/aten/src/ATen/templates/Type.h b/aten/src/ATen/templates/Type.h index df1425f589da9..3a7080ea201e3 100644 --- a/aten/src/ATen/templates/Type.h +++ b/aten/src/ATen/templates/Type.h @@ -104,6 +104,9 @@ struct AT_API Type { virtual Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const = 0; virtual Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const = 0; + virtual void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const = 0; + virtual void set_data(Tensor & self, Tensor new_data) const = 0; + virtual Tensor tensorFromBlob(void * data, IntList sizes, const std::function & deleter=noop_deleter) const = 0; virtual Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter=noop_deleter) const = 0; virtual Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const = 0; diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 6fd208b5914b0..5e614edc57f21 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -2,7 +2,10 @@ // ${generated_comment} +#include "ATen/core/SparseTensorRef.h" +#include "ATen/DeviceGuard.h" #include "ATen/ExpandUtils.h" +#include "ATen/Functions.h" #include "ATen/NativeFunctions.h" #include "ATen/core/Scalar.h" #include "ATen/core/SparseTensorRef.h" @@ -37,6 +40,14 @@ Tensor TypeDefault::copy(const Tensor & src, bool non_blocking) const { } } +void TypeDefault::backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const { + AT_ERROR("backward is 
not implemented for Tensor"); +} + +void TypeDefault::set_data(Tensor & self, Tensor new_data) const { + AT_ERROR("set_data is not implemented for Tensor"); +} + Type & TypeDefault::toBackend(Backend b) const { return at::globalContext().getNonVariableType(b,scalarType()); } diff --git a/aten/src/ATen/templates/TypeDefault.h b/aten/src/ATen/templates/TypeDefault.h index af9f37b704da6..64ec158f82349 100644 --- a/aten/src/ATen/templates/TypeDefault.h +++ b/aten/src/ATen/templates/TypeDefault.h @@ -28,6 +28,9 @@ struct AT_API TypeDefault : public Type { Tensor copy(const Tensor & src, bool non_blocking=false) const override; Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking=false) const override; + void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const override; + void set_data(Tensor & self, Tensor new_data) const override; + Tensor tensorFromBlob(void * data, IntList sizes, const std::function & deleter=noop_deleter) const override; Tensor tensorFromBlob(void * data, IntList sizes, IntList strides, const std::function & deleter=noop_deleter) const override; Tensor tensorWithAllocator(IntList sizes, Allocator* allocator) const override; diff --git a/aten/src/ATen/templates/TypeDerived.h b/aten/src/ATen/templates/TypeDerived.h index be26dfc09a9b1..3a48d8b26e32b 100644 --- a/aten/src/ATen/templates/TypeDerived.h +++ b/aten/src/ATen/templates/TypeDerived.h @@ -4,7 +4,6 @@ #include "ATen/CPUTypeDefault.h" #include "ATen/Context.h" -#include "ATen/TensorMethods.h" #include "ATen/CheckGenerator.h" $extra_cuda_headers diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 473c9ff12b52e..e10de30ae8e02 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -147,18 +147,18 @@ void test(Type & T, Type & AccT) { SECTION( "_standard_gamma_grad" ) { // check empty auto empty = ones({0}, T); - REQUIRE_EQUAL(empty, empty._standard_gamma_grad(empty)); + REQUIRE_EQUAL(empty, at::_standard_gamma_grad(empty, empty)); // check scalar equals one element auto one_scalar = ones({}, T).mul(5); auto one_with_dim = ones({1}, T).mul(5); - REQUIRE_ALLCLOSE(one_scalar._standard_gamma_grad(one_scalar), - one_with_dim._standard_gamma_grad(one_with_dim).sum()); + REQUIRE_ALLCLOSE(at::_standard_gamma_grad(one_scalar, one_scalar), + at::_standard_gamma_grad(one_with_dim, one_with_dim).sum()); // check mixing types auto t1 = randn({3, 4}, T); auto t2 = randn({3, 4}, T).toType(kDouble); - REQUIRE_THROWS_WITH(t1._standard_gamma_grad(t2), Catch::StartsWith("expected scalar type")); + REQUIRE_THROWS_WITH(at::_standard_gamma_grad(t1, t2), Catch::StartsWith("expected scalar type")); } SECTION( "where" ) { diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index b6e40d89b8e5e..72ef4e4ad3cf4 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -88,7 +88,7 @@ TEST_CASE( "scalar test", "[]" ) { Tensor next_h = i2h.add(h2h); next_h = next_h.tanh(); - REQUIRE_THROWS(Tensor{}._local_scalar()); + REQUIRE_THROWS(at::_local_scalar(Tensor{})); test_overflow(); diff --git a/aten/src/ATen/test/stream_test.cpp b/aten/src/ATen/test/stream_test.cpp index 18212f6e20a66..145c4f4c26127 100644 --- a/aten/src/ATen/test/stream_test.cpp +++ b/aten/src/ATen/test/stream_test.cpp @@ -3,6 +3,7 @@ #include "ATen/cuda/CUDAContext.h" #include "ATen/cuda/CUDAGuard.h" +#include "ATen/cuda/CUDAEvent.h" #include "cuda_runtime.h" @@ -211,7 +212,6 @@ 
TEST_CASE("Streampool Round Robin") { REQUIRE(hasDuplicates); } -// Note: to be expanded once CUDAEvent PR is accepted TEST_CASE("Multi-GPU") { if (at::cuda::getNumGPUs() < 2) return; @@ -226,3 +226,44 @@ TEST_CASE("Multi-GPU") { at::DeviceGuard device_guard{1}; REQUIRE(s1 == at::cuda::getCurrentCUDAStream()); } + +TEST_CASE("CUDAEvent Syncs") { + const auto stream = at::cuda::createCUDAStream(); + at::cuda::CUDAEvent event; + + REQUIRE(!event.happened()); + + event.recordOnce(stream); + + const auto wait_stream0 = at::cuda::createCUDAStream(); + const auto wait_stream1 = at::cuda::createCUDAStream(); + + wait_stream0.synchronize_with(event); + wait_stream1.synchronize_with(event); + + cudaStreamSynchronize(wait_stream0); + REQUIRE(event.happened()); +} + +TEST_CASE("Cross-Device Events") { + if (at::cuda::getNumGPUs() < 2) return; + + const auto stream0 = at::cuda::createCUDAStream(); + at::cuda::CUDAEvent event0; + + at::cuda::set_device(1); + const auto stream1 = at::cuda::createCUDAStream(); + at::cuda::CUDAEvent event1; + + event0.record(stream0); + event1.record(stream1); + + event0 = std::move(event1); + + REQUIRE(event0.device() == 1); + + stream0.synchronize_with(event0); + + cudaStreamSynchronize(stream0); + REQUIRE(event0.happened()); +} diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp index 1fed395a73a6d..55376b51e917e 100644 --- a/aten/src/TH/generic/THLapack.cpp +++ b/aten/src/TH/generic/THLapack.cpp @@ -13,8 +13,8 @@ TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, doub TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info); TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); -TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info); -TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info); +TH_EXTERNC void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info); +TH_EXTERNC void sgesdd_(char *jobz, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *iwork, int *info); TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); @@ -114,16 +114,16 @@ void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar /* Compute the singular value decomposition (SVD) of a real M-by-N matrix A, optionally computing the left and/or right singular vectors */ -void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, scalar_t *a, int lda, scalar_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *info) +void THLapack_(gesdd)(char jobz, int m, int n, scalar_t *a, int lda, scalar_t 
*s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *iwork, int *info) { #ifdef USE_LAPACK #if defined(TH_REAL_IS_DOUBLE) - dgesvd_( &jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); + dgesdd_( &jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); #else - sgesvd_( &jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); + sgesdd_( &jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); #endif #else - THError("gesvd : Lapack library not found in compile time\n"); + THError("gesdd : Lapack library not found in compile time\n"); #endif } diff --git a/aten/src/TH/generic/THLapack.h b/aten/src/TH/generic/THLapack.h index 284a0c4769369..a502c2bf95243 100644 --- a/aten/src/TH/generic/THLapack.h +++ b/aten/src/TH/generic/THLapack.h @@ -13,7 +13,7 @@ TH_API void THLapack_(syev)(char jobz, char uplo, int n, scalar_t *a, int lda, s /* Non-sym eigenvals */ TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info); /* svd */ -TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, scalar_t *a, int lda, scalar_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *info); +TH_API void THLapack_(gesdd)(char jobz, int m, int n, scalar_t *a, int lda, scalar_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *iwork, int *info); /* LU decomposition */ TH_API void THLapack_(getrf)(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); TH_API void THLapack_(getrs)(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index f301d9808df3d..7ddee0d42b856 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -29,17 +29,30 @@ void THTensor_(zero)(THTensor *r_) void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, scalar_t value) { +#ifdef _OPENMP + if (!omp_in_parallel()) { + int64_t tensor_size = THTensor_(nElement)(tensor); + int tensor_contig = THTensor_(isContiguous)(tensor); + int mask_contig = THTensor_(isContiguous)(mask); + TH_TENSOR_APPLY2_OMP(tensor_size, tensor_contig, mask_contig, + scalar_t, tensor, unsigned char, mask, + if (*mask_data > 1) { + THError("Mask tensor can take 0 and 1 values only"); + } else if (*mask_data == 1) { + *tensor_data = value; + }, + TH_OMP_OVERHEAD_THRESHOLD); + return; + } +#endif TH_TENSOR_APPLY2(scalar_t, tensor, unsigned char, mask, - if (*mask_data > 1) - { - THFree(mask_counter); - THFree(tensor_counter); - THError("Mask tensor can take 0 and 1 values only"); - } - else if (*mask_data == 1) - { - *tensor_data = value; - }); + if (*mask_data > 1) { + THFree(mask_counter); + THFree(tensor_counter); + THError("Mask tensor can take 0 and 1 values only"); + } else if (*mask_data == 1) { + *tensor_data = value; + }); } void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src ) diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index 7269a5081ca13..3785f92d3cfcc 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -405,14 +405,14 @@ void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz 
c10::raw::intrusive_ptr::decref(work); } -void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobu) +void THTensor_(gesdd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobz) { THTensor *ra_ = THTensor_(new)(); - THTensor_(gesvd2)(ru_, rs_, rv_, ra_, a, jobu); + THTensor_(gesdd2)(ru_, rs_, rv_, ra_, a, jobz); c10::raw::intrusive_ptr::decref(ra_); } -void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu) +void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobz) { if (a == NULL) a = ra_; THArgCheck(a->dim() == 2, 1, "A should be 2 dimensional"); @@ -422,6 +422,7 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra THTensor *work; THTensor *rvf_ = THTensor_(new)(); scalar_t wkopt; + THIntTensor *iwork; THTensor *ra__ = NULL; THTensor *ru__ = NULL; @@ -438,9 +439,11 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra ldu = m; ldvt = n; + iwork = k ? THIntTensor_newWithSize1d((int64_t)(8 * m)) : THIntTensor_newWithSize1d((int64_t)(8 * n)); + THTensor_(resize1d)(rs_,k); THTensor_(resize2d)(rvf_,ldvt,n); - if (*jobu == 'A') + if (*jobz == 'A') THTensor_(resize2d)(ru_,m,ldu); else THTensor_(resize2d)(ru_,k,ldu); @@ -452,22 +455,22 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra rs__ = THTensor_(newContiguous)(rs_); rv__ = THTensor_(newContiguous)(rvf_); - THLapack_(gesvd)(jobu[0],jobu[0], + THLapack_(gesdd)(jobz[0], m,n,ra__->data(),lda, rs__->data(), ru__->data(), ldu, rv__->data(), ldvt, - &wkopt, -1, &info); + &wkopt, -1, THIntTensor_data(iwork), &info); lwork = (int)wkopt; work = THTensor_(newWithSize1d)(lwork); - THLapack_(gesvd)(jobu[0],jobu[0], + THLapack_(gesdd)(jobz[0], m,n,ra__->data(),lda, rs__->data(), ru__->data(), ldu, rv__->data(), ldvt, - work->data(),lwork, &info); + work->data(),lwork, THIntTensor_data(iwork), &info); THLapackCheckWithCleanup("Lapack Error %s : %d superdiagonals failed to converge.", THCleanup( @@ -475,10 +478,11 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra c10::raw::intrusive_ptr::decref(rs__); c10::raw::intrusive_ptr::decref(rv__); c10::raw::intrusive_ptr::decref(ra__); - c10::raw::intrusive_ptr::decref(work);), - "gesvd", info, ""); + c10::raw::intrusive_ptr::decref(work); + c10::raw::intrusive_ptr::decref(iwork);), + "gesdd", info, ""); - if (*jobu == 'S') + if (*jobz == 'S') THTensor_(narrow)(rv__,NULL,1,0,k); THTensor_(freeCopyTo)(ru__, ru_); @@ -486,10 +490,11 @@ void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra THTensor_(freeCopyTo)(rv__, rvf_); THTensor_(freeCopyTo)(ra__, ra_); c10::raw::intrusive_ptr::decref(work); + c10::raw::intrusive_ptr::decref(iwork); - if (*jobu == 'S') { + if (*jobz == 'S') THTensor_(narrow)(rvf_,NULL,1,0,k); - } + THTensor_(resizeAs)(rv_, rvf_); THTensor_(copy)(rv_, rvf_); c10::raw::intrusive_ptr::decref(rvf_); diff --git a/aten/src/TH/generic/THTensorLapack.h b/aten/src/TH/generic/THTensorLapack.h index 3c2fdf97d6018..391dbeb79c487 100644 --- a/aten/src/TH/generic/THTensorLapack.h +++ b/aten/src/TH/generic/THTensorLapack.h @@ -7,8 +7,8 @@ TH_API void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTenso TH_API void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); TH_API void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char 
*jobz, const char *uplo); TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr); -TH_API void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *jobu); -TH_API void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char *jobu); +TH_API void THTensor_(gesdd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *jobz); +TH_API void THTensor_(gesdd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char *jobz); TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a); TH_API void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo); TH_API void THTensor_(potrs)(THTensor *rb_, THTensor *b_, THTensor *a_, const char *uplo); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index 9d53e2b8efe6f..ecf39d9a1bf0f 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -321,18 +321,18 @@ THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, T #endif } -THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) +THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu) { #ifdef USE_MAGMA THCTensor *ra_ = THCTensor_(new)(state); - THCTensor_(gesvd2)(state, ru_, rs_, rv_, ra_, a, jobu); + THCTensor_(gesdd2)(state, ru_, rs_, rv_, ra_, a, jobu); THCTensor_(free)(state, ra_); #else - THError(NoMagma(gesvd)); + THError(NoMagma(gesdd)); #endif } -THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) +THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobus) { #ifdef USE_MAGMA THArgCheck(!a->is_empty() && a->dim() == 2, 2, "A should be non-empty 2 dimensional"); @@ -392,7 +392,7 @@ THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, magma_free_pinned(rs_data); magma_free_pinned(a_data); #else - THError(NoMagma(gesvd2)); + THError(NoMagma(gesdd2)); #endif } diff --git a/aten/src/THC/generic/THCTensorMathMagma.h b/aten/src/THC/generic/THCTensorMathMagma.h index 1462af4ddaadb..2cd9b2f5e705c 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.h +++ b/aten/src/THC/generic/THCTensorMathMagma.h @@ -11,8 +11,8 @@ THC_API void THCTensor_(trtrs)(THCState *state, THCTensor *rb_, THCTensor *ra_, THC_API void THCTensor_(gels)(THCState *state, THCTensor *rb_, THCTensor *ra_, THCTensor *b_, THCTensor *a_); THC_API void THCTensor_(syev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobz, const char *uplo); THC_API void THCTensor_(geev)(THCState *state, THCTensor *re_, THCTensor *rv_, THCTensor *a_, const char *jobvr); -THC_API void THCTensor_(gesvd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu); -THC_API void THCTensor_(gesvd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(gesdd)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *a, const char *jobu); +THC_API void THCTensor_(gesdd2)(THCState *state, THCTensor *ru_, THCTensor *rs_, THCTensor *rv_, THCTensor *ra_, THCTensor *a, const char 
*jobu); THC_API void THCTensor_(getri)(THCState *state, THCTensor *ra_, THCTensor *a); THC_API void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); THC_API void THCTensor_(potrf)(THCState *state, THCTensor *ra_, THCTensor *a, const char *uplo); diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2a338b85dff4d..32183e212c923 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -425,25 +425,12 @@ if (BUILD_TEST) endif() endforeach() endif() -endif() - -set(__aten_test_dir "test/aten") -if (NOT USE_ROCM) - foreach(test_src ${ATen_CPU_TEST_SRCS}) - get_filename_component(test_name ${test_src} NAME_WE) - add_executable(${test_name} "${test_src}") - target_include_directories(${test_name} PRIVATE $) - target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) - target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) - target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) - add_test(NAME ${test_name} COMMAND $) - install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) - endforeach() - if(USE_CUDA OR USE_ROCM) - foreach(test_src ${ATen_CUDA_TEST_SRCS}) + if (NOT USE_ROCM) + set(__aten_test_dir "test/aten") + foreach(test_src ${ATen_CPU_TEST_SRCS}) get_filename_component(test_name ${test_src} NAME_WE) - torch_cuda_based_add_executable(${test_name} "${test_src}") + add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE $) target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) @@ -451,6 +438,19 @@ if (NOT USE_ROCM) add_test(NAME ${test_name} COMMAND $) install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) endforeach() + + if(USE_CUDA OR USE_ROCM) + foreach(test_src ${ATen_CUDA_TEST_SRCS}) + get_filename_component(test_name ${test_src} NAME_WE) + torch_cuda_based_add_executable(${test_name} "${test_src}") + target_include_directories(${test_name} PRIVATE $) + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + target_include_directories(${test_name} SYSTEM PRIVATE ${Caffe2_DEPENDENCY_INCLUDE}) + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS}) + add_test(NAME ${test_name} COMMAND $) + install(TARGETS ${test_name} DESTINATION ${__aten_test_dir}) + endforeach() + endif() endif() endif() diff --git a/caffe2/core/blob.h b/caffe2/core/blob.h index d54fa5aa98197..f085ee23995bd 100644 --- a/caffe2/core/blob.h +++ b/caffe2/core/blob.h @@ -128,6 +128,9 @@ class CAFFE2_API Blob { std::is_default_constructible::value, "GetMutable can't be called with non-default-constructible types. 
" "Try using specialized methods"); + static_assert( + !std::is_same::value, + "Use GetMutableTensor(DeviceType) instead"); if (IsType()) { return static_cast(pointer_); } else { diff --git a/caffe2/core/context.h b/caffe2/core/context.h index b12dc13e88657..4faaea93c6da1 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -205,6 +205,11 @@ class CAFFE2_API CPUStaticContext : public BaseStaticContext { return CPU; } + void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) + override { + device->set_device_type(TypeToProto(GetDeviceType())); + } + protected: static MemoryAllocationReporter reporter_; diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h index 196bcb30ec79f..6f3eca1b6fb51 100644 --- a/caffe2/core/context_base.h +++ b/caffe2/core/context_base.h @@ -1,173 +1,14 @@ #pragma once -#include -#include -#include -#include -#include - -#include "caffe2/core/allocator.h" -#include "caffe2/core/event.h" +#include +// For CaffeMap +#include "caffe2/core/common.h" #include "caffe2/core/logging.h" -#include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2_pb.h" namespace caffe2 { - -class BaseContext; - -/* BaseStaticContext defines the interface for static context, which contains - functions that are invoked statically before in Tensor class, e.g. New, - We will merge this with Allocator later. - */ -class CAFFE2_API BaseStaticContext { - public: - virtual ~BaseStaticContext() noexcept {} - - virtual std::pair New(size_t nbytes) const = 0; - - virtual std::unique_ptr CreateContext() = 0; - - virtual std::unique_ptr CreateContext(const DeviceOption&) = 0; - - virtual DeviceType GetDeviceType() = 0; - - /* - * @brief: Sets the DeviceOption for argument `device` based on the - * current context and the a data pointer - */ - virtual void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) { - device->set_device_type(TypeToProto(GetDeviceType())); - } -}; - -/** - * Virtual interface for the Context class in Caffe2. - * - * A Context defines all the necessities to run an operator on a specific - * device. Specific Context classes needs to implement all the pure virtual - * functions in the BaseContext class. - * TODO: add docs after this is finalized. - */ -class CAFFE2_API BaseContext { - public: - virtual ~BaseContext() noexcept {} - - virtual BaseStaticContext* GetStaticContext() const = 0; - - /* Sorry for the naming, will get rid of this in future diff */ - virtual DeviceType GetDevicetype() const = 0; - - virtual void SwitchToDevice(int /*stream_id*/) = 0; - - inline void SwitchToDevice() { - SwitchToDevice(0); - } - - virtual void WaitEvent(const Event& ev) = 0; - - virtual void Record(Event* ev, const char* err_msg = nullptr) const = 0; - - virtual void FinishDeviceComputation() = 0; - - // This used to be arbitrary cross-device copy, but it turns out everyone - // did direct CPU-X copy, so we just make three functions for it (to avoid - // double dispatch). This will get obsoleted by C10. where copies - // will be proper operators (and get to rely on multiple dispatch there.) 
- virtual void - CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) = 0; - - virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0; - - virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0; - - virtual void CopyBytesToDevice( - size_t nbytes, - const void* src, - void* dst, - DeviceType type) { - if (type == CPU) { - CopyBytesToCPU(nbytes, src, dst); - } else if (type == GetDevicetype()) { - CopyBytesSameDevice(nbytes, src, dst); - } else { - CAFFE_THROW( - "CopyBytesToDevice can only copy to CPU or between same " - "device. Can't copy from: ", - GetDevicetype(), - " to", - type); - } - } - - template - inline void CopySameDevice(size_t n, const T* src, T* dst) { - static_assert( - std::is_fundamental::value, - "CopySameDevice requires fundamental types"); - CopyBytesSameDevice( - n * sizeof(T), static_cast(src), static_cast(dst)); - } - - template - inline void CopyFromCPU(size_t n, const T* src, T* dst) { - static_assert( - std::is_fundamental::value, - "CopyFromCPU requires fundamental types"); - CopyBytesFromCPU( - n * sizeof(T), static_cast(src), static_cast(dst)); - } - - template - inline void CopyToCPU(size_t n, const T* src, T* dst) { - static_assert( - std::is_fundamental::value, "CopyToCPU requires fundamental types"); - CopyBytesToCPU( - n * sizeof(T), static_cast(src), static_cast(dst)); - } - - virtual bool SupportsNonFundamentalTypes() const { - return false; - } - - inline void EnforceMetaCopyOK() { - CAFFE_ENFORCE( - SupportsNonFundamentalTypes(), "Context requires fundamental types"); - } - - inline void CopyItemsSameDevice( - const TypeMeta& meta, - size_t n, - const void* src, - void* dst) { - if (meta.copy()) { - EnforceMetaCopyOK(); - meta.copy()(src, dst, n); - } else { - CopyBytesSameDevice(n * meta.itemsize(), src, dst); - } - } - - inline void - CopyItemsFromCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) { - if (meta.copy()) { - EnforceMetaCopyOK(); - meta.copy()(src, dst, n); - } else { - CopyBytesFromCPU(n * meta.itemsize(), src, dst); - } - } - - inline void - CopyItemsToCPU(const TypeMeta& meta, size_t n, const void* src, void* dst) { - if (meta.copy()) { - EnforceMetaCopyOK(); - meta.copy()(src, dst, n); - } else { - CopyBytesToCPU(n * meta.itemsize(), src, dst); - } - } -}; +using at::BaseContext; +using at::BaseStaticContext; using StaticContextMap = CaffeMap; CAFFE2_API StaticContextMap& GetStaticContexts(); diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 33e674d4a9259..06b74d11bd585 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -273,7 +273,7 @@ class CAFFE2_API DBReader { for (uint32_t s = 0; s < shard_id_; s++) { cursor_->Next(); CAFFE_ENFORCE( - cursor_->Valid(), "Db has less rows than shard id: ", s, shard_id_); + cursor_->Valid(), "Db has fewer rows than shard id: ", s, shard_id_); } } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index d84e45b1facb8..c8657728c57e7 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -190,6 +190,11 @@ class IDEEPStaticContext : public BaseStaticContext { DeviceType GetDeviceType() override { return IDEEP; } + + void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) + override { + device->set_device_type(TypeToProto(GetDeviceType())); + } }; } // namespace caffe2 diff --git a/caffe2/mkl/utils/mkl_context.h b/caffe2/mkl/utils/mkl_context.h index 18c492694db99..636ebf2217eac 100644 --- a/caffe2/mkl/utils/mkl_context.h +++ 
b/caffe2/mkl/utils/mkl_context.h @@ -167,6 +167,11 @@ class MKLStaticContext : public BaseStaticContext { DeviceType GetDeviceType() override { return MKLDNN; } + + void ExtractDeviceOption(DeviceOption* device, const void* /*data*/) + override { + device->set_device_type(TypeToProto(GetDeviceType())); + } }; } // namespace caffe2 diff --git a/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc b/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc index 3115e9e30cb6e..4ec3ed1021719 100644 --- a/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc +++ b/caffe2/operators/hip/spatial_batch_norm_op_miopen.cc @@ -118,7 +118,10 @@ bool MIOpenSpatialBNOp::DoRunWithType() { // Only 2D BatchNorm is supported in MIopen for now // @petrex will follow up on adding 1D and 3D support - CAFFE_ENFORCE_EQ(X.ndim(), 4, "Only 2D input is supported in MIOpen BatchNormalization right now."); + CAFFE_ENFORCE_EQ( + X.ndim(), + 4, + "Only 2D input is supported in MIOpen BatchNormalization right now."); const int N = X.dim32(0); const int C = X.dim32(1); const int H = X.dim32(2); @@ -298,13 +301,13 @@ bool MIOpenSpatialBNGradientOp::DoRunWithType() { const void* saved_mean_data = saved_mean.template data(); const void* saved_var_data = saved_var.template data(); -if (N == 0) { + if (N == 0) { // set gradients to zeros math::Set(C, 0, dScale_data, &context_); math::Set(C, 0, dBias_data, &context_); return true; } - + MIOPEN_ENFORCE(miopenBatchNormalizationBackward( miopen_wrapper_.inline_miopen_handle(), mode_, @@ -336,7 +339,6 @@ bool MIOpenSpatialBNGradientOp::RunOnDevice() { return true; } - REGISTER_MIOPEN_OPERATOR(SpatialBN, MIOpenSpatialBNOp); REGISTER_MIOPEN_OPERATOR(SpatialBNGradient, MIOpenSpatialBNGradientOp); } // namespace caffe2 diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h index 35cf83811fecc..89762e3871603 100644 --- a/caffe2/operators/partition_ops.h +++ b/caffe2/operators/partition_ops.h @@ -248,6 +248,21 @@ class LengthsPartitionOp : public PartitionOpBase { 1, "Only 1-D tensors supported as a partitioning tensor for sharding"); + if (partitions == 1) { + // Specialization when partitions == 1 which just becomes a copy. 
+ for (int i = 0; i < InputSize(); ++i) { + auto& input = Input(i); + auto& output = *Output(i); + output.ResizeLike(input); + context_.CopyItemsSameDevice( + input.meta(), + input.size(), + input.raw_data(), + output.raw_mutable_data(input.meta())); + } + return true; + } + // Apply sharding to all parameters except lengths ApplyPartition(true /* skipFirstArgument */); diff --git a/caffe2/operators/spatial_batch_norm_op.cc b/caffe2/operators/spatial_batch_norm_op.cc index dc064cd951acf..f1b369891613a 100644 --- a/caffe2/operators/spatial_batch_norm_op.cc +++ b/caffe2/operators/spatial_batch_norm_op.cc @@ -6,69 +6,6 @@ namespace caffe2 { -template <> -template -void SpatialBNOp::ComputeFusedParam( - const int C, - const T* scale, - const T* bias, - const T* mean, - const T* var, - T* alpha, - T* beta) { - EigenVectorArrayMap alpha_arr(alpha, C); - EigenVectorArrayMap beta_arr(beta, C); - alpha_arr = ConstEigenVectorArrayMap(scale, C) * - (ConstEigenVectorArrayMap(var, C) + static_cast(epsilon_)).rsqrt(); - beta_arr = ConstEigenVectorArrayMap(bias, C) - - alpha_arr * ConstEigenVectorArrayMap(mean, C); -} - -template <> -template -void SpatialBNOp::ComputeBatchMoments( - const int N, - const int C, - const int HxW, - const T* batch_mean_sum, - const T* batch_var_sum, - T* mean, - T* var) { - const T scale = T(1) / static_cast(num_batches_ * N * HxW); - EigenVectorArrayMap mean_arr(mean, C); - EigenVectorArrayMap var_arr(var, C); - mean_arr = ConstEigenVectorArrayMap(batch_mean_sum, C) * scale; - var_arr = - ConstEigenVectorArrayMap(batch_var_sum, C) * scale - mean_arr.square(); -} - -template <> -template -void SpatialBNOp::ComputeRunningMomentsAndFusedParam( - const int C, - const T* scale, - const T* bias, - const T* mean, - const T* var, - T* running_mean, - T* running_var, - T* rstd, - T* alpha, - T* beta) { - const T a = T(1) - static_cast(momentum_); - const T b = static_cast(momentum_); - math::Axpby(C, a, mean, b, running_mean, &context_); - math::Axpby(C, a, var, b, running_var, &context_); - math::InvStd( - C, static_cast(epsilon_), var, rstd, &context_); - EigenVectorArrayMap alpha_arr(alpha, C); - EigenVectorArrayMap beta_arr(beta, C); - alpha_arr = ConstEigenVectorArrayMap(scale, C) * - ConstEigenVectorArrayMap(rstd, C); - beta_arr = ConstEigenVectorArrayMap(bias, C) - - alpha_arr * ConstEigenVectorArrayMap(mean, C); -} - namespace { OpSchema::Cost CostInferenceForSpatialBN( diff --git a/caffe2/operators/spatial_batch_norm_op.h b/caffe2/operators/spatial_batch_norm_op.h index 754ac50f3725f..cb81ad56ade5d 100644 --- a/caffe2/operators/spatial_batch_norm_op.h +++ b/caffe2/operators/spatial_batch_norm_op.h @@ -9,6 +9,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include "caffe2/utils/eigen_utils.h" #include "caffe2/utils/math.h" namespace caffe2 { @@ -197,7 +198,15 @@ class SpatialBNOp : public Operator { const T* mean, const T* var, T* alpha, - T* beta); + T* beta) { + EigenVectorArrayMap alpha_arr(alpha, C); + EigenVectorArrayMap beta_arr(beta, C); + alpha_arr = ConstEigenVectorArrayMap(scale, C) * + (ConstEigenVectorArrayMap(var, C) + static_cast(epsilon_)) + .rsqrt(); + beta_arr = ConstEigenVectorArrayMap(bias, C) - + alpha_arr * ConstEigenVectorArrayMap(mean, C); + } template void ComputeBatchMoments( @@ -207,7 +216,14 @@ class SpatialBNOp : public Operator { const T* batch_mean_sum, const T* batch_var_sum, T* mean, - T* var); + T* var) { + const T scale = T(1) / static_cast(num_batches_ * N * HxW); + EigenVectorArrayMap mean_arr(mean, C); + 
EigenVectorArrayMap var_arr(var, C); + mean_arr = ConstEigenVectorArrayMap(batch_mean_sum, C) * scale; + var_arr = ConstEigenVectorArrayMap(batch_var_sum, C) * scale - + mean_arr.square(); + } template void ComputeRunningMomentsAndFusedParam( @@ -220,7 +236,19 @@ class SpatialBNOp : public Operator { T* running_var, T* rstd, T* alpha, - T* beta); + T* beta) { + const T a = T(1) - static_cast(momentum_); + const T b = static_cast(momentum_); + math::Axpby(C, a, mean, b, running_mean, &context_); + math::Axpby(C, a, var, b, running_var, &context_); + math::InvStd(C, static_cast(epsilon_), var, rstd, &context_); + EigenVectorArrayMap alpha_arr(alpha, C); + EigenVectorArrayMap beta_arr(beta, C); + alpha_arr = ConstEigenVectorArrayMap(scale, C) * + ConstEigenVectorArrayMap(rstd, C); + beta_arr = ConstEigenVectorArrayMap(bias, C) - + alpha_arr * ConstEigenVectorArrayMap(mean, C); + } const bool is_test_; double epsilon_; diff --git a/caffe2/operators/spatial_batch_norm_op_cudnn.cu b/caffe2/operators/spatial_batch_norm_op_cudnn.cu index 39adc54a17c26..a5eacbf199d62 100644 --- a/caffe2/operators/spatial_batch_norm_op_cudnn.cu +++ b/caffe2/operators/spatial_batch_norm_op_cudnn.cu @@ -64,7 +64,12 @@ class CuDNNSpatialBNOp final : public SpatialBNOp { : SpatialBNOp(operator_def, ws), cudnn_wrapper_(&context_), #if CUDNN_VERSION_MIN(7, 0, 0) - mode_(CUDNN_BATCHNORM_SPATIAL_PERSISTENT) { + // TODO(T31829456): The new CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode was + // introduced in CuDNN 7 for performance optimization, but it results in + // accuracy losses in convolution models such as ResNeXt-101 and + // video R(2+1)D. We will fall back to the normal + // CUDNN_BATCHNORM_SPATIAL for now + mode_(CUDNN_BATCHNORM_SPATIAL) { #else mode_(CUDNN_BATCHNORM_SPATIAL) { #endif @@ -225,7 +230,12 @@ class CuDNNSpatialBNGradientOp final : public SpatialBNGradientOp { : SpatialBNGradientOp(operator_def, ws), cudnn_wrapper_(&context_), #if CUDNN_VERSION_MIN(7, 0, 0) - mode_(CUDNN_BATCHNORM_SPATIAL_PERSISTENT) { + // TODO(T31829456): The new CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode was + // introduced in CuDNN 7 for performance optimization, but it results in + // accuracy losses in convolution models such as ResNeXt-101 and + // video R(2+1)D. We will fall back to the normal + // CUDNN_BATCHNORM_SPATIAL for now + mode_(CUDNN_BATCHNORM_SPATIAL) { #else mode_(CUDNN_BATCHNORM_SPATIAL) { #endif diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h index 291cb6bfe82b2..0a08c8db241e9 100644 --- a/caffe2/proto/caffe2_pb.h +++ b/caffe2/proto/caffe2_pb.h @@ -44,9 +44,6 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) { ". If you have recently updated the caffe2.proto file to add a new " "device type, did you forget to update the ProtoToType() and TypeToProto" "function to reflect such recent changes?"); - // The below code won't run but is needed to suppress some compiler - // warnings. - return DeviceType::ONLY_FOR_TEST; } } @@ -77,9 +74,6 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) { ". If you have recently updated the caffe2.proto file to add a new " "device type, did you forget to update the ProtoToType() and TypeToProto" "function to reflect such recent changes?"); - // The below code won't run but is needed to suppress some compiler - // warnings. 
- return PROTO_ONLY_FOR_TEST; } } diff --git a/caffe2/python/modeling/get_entry_from_blobs.py b/caffe2/python/modeling/get_entry_from_blobs.py index 49d7e8466162f..2b693114362f7 100644 --- a/caffe2/python/modeling/get_entry_from_blobs.py +++ b/caffe2/python/modeling/get_entry_from_blobs.py @@ -33,7 +33,8 @@ class GetEntryFromBlobs(NetModifier): blobs: list of blobs to get entry from logging_frequency: frequency for printing entry values to logs i1, i2: the first, second dimension of the blob. (currently, we assume - the blobs to be 2-dimensional blobs) + the blobs to be 2-dimensional blobs). When i2 = -1, print all entries + in blob[i1] """ def __init__(self, blobs, logging_frequency, i1=0, i2=0): @@ -41,13 +42,14 @@ def __init__(self, blobs, logging_frequency, i1=0, i2=0): self._logging_frequency = logging_frequency self._i1 = i1 self._i2 = i2 - self._field_name_suffix = '_{0}_{1}'.format(i1, i2) + self._field_name_suffix = '_{0}_{1}'.format(i1, i2) if i2 >= 0 \ + else '_{0}_all'.format(i1) def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, modify_output_record=False): i1, i2 = [self._i1, self._i2] - if i1 < 0 or i2 < 0: + if i1 < 0: raise ValueError('index is out of range') for blob_name in self._blobs: @@ -57,16 +59,20 @@ def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, blob, net.Name())) blob_i1 = net.Slice([blob], starts=[i1, 0], ends=[i1 + 1, -1]) - blob_i1_i2 = net.Slice([blob_i1], - net.NextScopedBlob(prefix=blob + '_{0}_{1}'.format(i1, i2)), - starts=[0, i2], ends=[-1, i2 + 1]) + if self._i2 == -1: + blob_i1_i2 = net.Copy([blob_i1], + [net.NextScopedBlob(prefix=blob + '_{0}_all'.format(i1))]) + else: + blob_i1_i2 = net.Slice([blob_i1], + net.NextScopedBlob(prefix=blob + '_{0}_{1}'.format(i1, i2)), + starts=[0, i2], ends=[-1, i2 + 1]) if self._logging_frequency >= 1: net.Print(blob_i1_i2, [], every_n=self._logging_frequency) if modify_output_record: output_field_name = str(blob) + self._field_name_suffix - output_scalar = schema.Scalar((np.float, (1,)), blob_i1_i2) + output_scalar = schema.Scalar((np.float), blob_i1_i2) if net.output_record() is None: net.set_output_record( diff --git a/caffe2/python/modeling/get_entry_from_blobs_test.py b/caffe2/python/modeling/get_entry_from_blobs_test.py index f6023c642e0e2..8f4fbb864be10 100644 --- a/caffe2/python/modeling/get_entry_from_blobs_test.py +++ b/caffe2/python/modeling/get_entry_from_blobs_test.py @@ -60,7 +60,7 @@ def test_get_entry_from_blobs_modify_output_record(self): # no operator name set, will use default brew.fc(model, fc1, "fc2", dim_in=4, dim_out=4) - i1, i2 = np.random.randint(4, size=2) + i1, i2 = np.random.randint(4), np.random.randint(5) - 1 net_modifier = GetEntryFromBlobs( blobs=['fc1_w', 'fc2_w'], logging_frequency=10, @@ -74,10 +74,18 @@ def test_get_entry_from_blobs_modify_output_record(self): workspace.RunNetOnce(model.net) fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_entry = workspace.FetchBlob('fc1_w_{0}_{1}'.format(i1, i2)) + if i2 < 0: + fc1_w_entry = workspace.FetchBlob('fc1_w_{0}_all'.format(i1)) + else: + fc1_w_entry = workspace.FetchBlob('fc1_w_{0}_{1}'.format(i1, i2)) - self.assertEqual(fc1_w_entry.size, 1) - self.assertEqual(fc1_w_entry[0], fc1_w[i1][i2]) + if i2 < 0: + self.assertEqual(fc1_w_entry.size, 4) + for j in range(4): + self.assertEqual(fc1_w_entry[0][j], fc1_w[i1][j]) + else: + self.assertEqual(fc1_w_entry.size, 1) + self.assertEqual(fc1_w_entry[0], fc1_w[i1][i2]) assert 'fc1_w' + net_modifier.field_name_suffix() in\ 
model.net.output_record().field_blobs(),\ diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 9acbf1f9bc84c..79dadb091488e 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -518,7 +518,7 @@ def make_rnn(direction_offset): pred_mh.net.VariableLengthSequencePadding( [concatted_output, sequence_lens], [concatted_output]) reshaped_output, _ = pred_mh.net.Reshape(concatted_output, [cls.dummy_name(), cls.dummy_name()], shape=[0,0,-1,2]) - pred_mh.net.Transpose(reshaped_output, n.outputs[0], axes=[0,3,1,2]) + pred_mh.net.Transpose(reshaped_output, n.outputs[0], axes=[0,2,1,3]) for i in range(1, len(n.outputs)): pred_mh.net.Concat([outputs_f[i], outputs_b[i]], [n.outputs[i], cls.dummy_name()], axis=0) diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc index bca9b69706987..fbfe143f66cee 100644 --- a/caffe2/python/pybind_state_nomni.cc +++ b/caffe2/python/pybind_state_nomni.cc @@ -106,10 +106,10 @@ void addNomnigraphMethods(pybind11::module& m) { [](NNModule* nn) -> NNGraph* { return &nn->dataFlow; }, py::return_value_policy::reference_internal) .def("convertToCaffe2Proto", [](NNModule& nn, py::object def) { - auto attr = def.attr("SerializeToString"); CAFFE_ENFORCE( - attr, "convertToCaffe2Proto takes either no args", "a NetDef"); - auto str = attr(); + pybind11::hasattr(def, "SerializeToString"), + "convertToCaffe2Proto takes either no args", "a NetDef"); + auto str = def.attr("SerializeToString")(); caffe2::NetDef proto; proto.ParseFromString(py::bytes(str)); auto new_proto = caffe2::convertToCaffe2Proto(nn, proto); @@ -153,12 +153,11 @@ void addNomnigraphMethods(pybind11::module& m) { .def( "createNode", [](NNGraph* g, py::object op_def) { - auto attr = op_def.attr("SerializeToString"); CAFFE_ENFORCE( - attr, + pybind11::hasattr(op_def, "SerializeToString"), "createNode takes either OperatorDef", "or ng.NeuralNetOperator"); - auto str = attr(); + auto str = op_def.attr("SerializeToString")(); OperatorDef op; op.ParseFromString(py::bytes(str)); if (op.input().size() || op.output().size()) { diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index ad9a80ed9203b..55c4a6a6e50af 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -291,13 +291,13 @@ constexpr inline array tail(const array& arg) { namespace detail { template -constexpr inline array prepend_(T head, const array& tail, guts::index_sequence) { - return {{std::move(head), get(tail)...}}; +constexpr inline array prepend_(T&& head, const array& tail, guts::index_sequence) { + return {{std::forward(head), get(tail)...}}; } } template -constexpr inline array prepend(T head, const array& tail) { - return detail::prepend_(std::move(head), tail, guts::make_index_sequence()); +constexpr inline array prepend(T&& head, const array& tail) { + return detail::prepend_(std::forward(head), tail, guts::make_index_sequence()); } /** diff --git a/caffe2/utils/Array_test.cpp b/caffe2/utils/Array_test.cpp index 1f3171ebe88eb..1d8c290b8a224 100644 --- a/caffe2/utils/Array_test.cpp +++ b/caffe2/utils/Array_test.cpp @@ -78,9 +78,11 @@ namespace test_tail { static_assert(array < int, 0 > {{}} == tail(array < int, 1 > {{3}}), ""); } -namespace test_prepend { - static_assert(array < int, 3 > {{2, 3, 4}} == prepend(2, array < int, 2 > {{3, 4}}), ""); - static_assert(array < int, 1 > {{3}} == prepend(3, array < int, 0 > {{}}), ""); +TEST(ArrayTest, TestPrepend) { + // Some compilers can't handle move results as constexpr, so use + // gtest assert for 
this test + ASSERT_EQ((array {{2, 3, 4}}), (prepend(2, array {{3, 4}}))); + ASSERT_EQ((array {{3}}), (prepend(3, array {{}}))); } namespace test_to_std_array { diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 238dc3dff253f..67b65c0dcba74 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -185,9 +185,13 @@ endif() # ---[ Googletest and benchmark if(BUILD_TEST) + # Preserve build options. set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) + set(TEMP_CMAKE_DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX}) + # We will build gtest as static libs and embed it directly into the binary. - set(BUILD_SHARED_LIBS OFF) + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE) + # For gtest, we will simply embed it into our test binaries, so we won't # need to install it. set(BUILD_GTEST ON) @@ -208,8 +212,10 @@ if(BUILD_TEST) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark) include_directories(${CMAKE_CURRENT_LIST_DIR}/../third_party/benchmark/include) - # Recover the build shared libs option. - set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS}) + # Recover build options. Unfortunately gtest modifies CMAKE_DEBUG_POSTFIX + # in some versions as detailed at https://github.com/google/googletest/issues/1334 + set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE) + set(CMAKE_DEBUG_POSTFIX ${TEMP_CMAKE_DEBUG_POSTFIX} CACHE BOOL "Debug postfix" FORCE) endif() # ---[ LMDB @@ -406,11 +412,18 @@ if(BUILD_PYTHON) endif() # ---[ pybind11 -find_package(pybind11) +find_package(pybind11 CONFIG) +if((DEFINED pybind11_DIR) AND pybind11_DIR) + get_target_property(pybind11_INCLUDE_DIRS pybind11::pybind11 INTERFACE_INCLUDE_DIRECTORIES) +else() + message("pybind11 config not found. Fallback to legacy find.") + find_package(pybind11) +endif() + if(pybind11_FOUND) - include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) + include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) else() - include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/pybind11/include) + include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/pybind11/include) endif() # ---[ MPI diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake index 78df03e8976b7..edc9b3ab3fda7 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake @@ -561,12 +561,10 @@ else() set(c_compiler_realpath "") endif() set(CUDA_HOST_COMPILER "${c_compiler_realpath}" CACHE FILEPATH "Host side compiler used by NVCC") - elseif(MSVC AND ("${CMAKE_C_COMPILER}" MATCHES "clcache" OR "${CMAKE_C_COMPILER}" MATCHES "sccache")) - # NVCC does not think it will work if it is passed clcache.exe as the host - # compiler, which means that builds with CC=cl.exe won't work. Best to just - # feed it whatever the actual cl.exe is as the host compiler. - # - # FYI: clcache works as the match, but clcache.exe does NOT. + elseif(MSVC AND "${CMAKE_C_COMPILER}" MATCHES "clcache|sccache") + # NVCC does not think it will work if it is passed clcache.exe or sccache.exe + # as the host compiler, which means that builds with CC=cl.exe won't work. + # Best to just feed it whatever the actual cl.exe is as the host compiler. 
set(CUDA_HOST_COMPILER "cl.exe" CACHE FILEPATH "Host side compiler used by NVCC") else() set(CUDA_HOST_COMPILER "${CMAKE_C_COMPILER}" @@ -575,7 +573,7 @@ else() endif() # Propagate the host flags to the host compiler via -Xcompiler -option(CUDA_PROPAGATE_HOST_FLAGS "Propage C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON) +option(CUDA_PROPAGATE_HOST_FLAGS "Propagate C/CXX_FLAGS and friends to the host compiler via -Xcompile" ON) # Blacklisted flags to prevent propagation set(CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "" CACHE STRING "Blacklisted flags to prevent propagation") @@ -981,7 +979,8 @@ if(NOT CUDA_VERSION VERSION_LESS "3.2") find_cuda_helper_libs(nvcuvid) endif() endif() -if(CUDA_VERSION VERSION_GREATER "5.0") +if(CUDA_VERSION VERSION_GREATER "5.0" AND CUDA_VERSION VERSION_LESS "9.2") + # In CUDA 9.2 cublas_device was deprecated find_cuda_helper_libs(cublas_device) endif() @@ -1413,7 +1412,7 @@ macro(CUDA_WRAP_SRCS cuda_target format generated_files) else() set(CUDA_HOST_SHARED_FLAGS) endif() - + macro(_filter_blacklisted_host_flags CUDA_FLAGS) string(REGEX REPLACE "[ \t]+" ";" ${CUDA_FLAGS} "${${CUDA_FLAGS}}") foreach(_blacklisted ${CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST}) @@ -1790,7 +1789,7 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options add_custom_command( OUTPUT ${output_file} DEPENDS ${object_files} - COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} ${CUDA_cublas_device_LIBRARY} -o ${output_file} + COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} -dlink ${object_files} -o ${output_file} ${flags} COMMENT "Building NVCC intermediate link file ${output_file_relative_path}" COMMAND_EXPAND_LISTS @@ -1803,7 +1802,7 @@ function(CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS output_file cuda_target options PRE_LINK COMMAND ${CMAKE_COMMAND} -E echo "Building NVCC intermediate link file ${output_file_relative_path}" COMMAND ${CMAKE_COMMAND} -E make_directory "${output_file_dir}" - COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} ${CUDA_cublas_device_LIBRARY} -o "${output_file}" + COMMAND ${CUDA_NVCC_EXECUTABLE} ${nvcc_flags} ${flags} -dlink ${object_files} -o "${output_file}" COMMAND_EXPAND_LISTS ${_verbatim} ) diff --git a/conda/integrated/build.sh b/conda/integrated/build.sh index a485ac853fd9d..4862de466d642 100755 --- a/conda/integrated/build.sh +++ b/conda/integrated/build.sh @@ -51,10 +51,10 @@ fi # Build Caffe2 and PyTorch ########################################################### if [[ "$OSTYPE" == "darwin"* ]]; then - FULL_CAFFE2=1 MACOSX_DEPLOYMENT_TARGET=10.9 python setup.py install + USE_OPENCV=1 BUILD_BINARY=1 MACOSX_DEPLOYMENT_TARGET=10.9 python setup.py install exit 0 fi -FULL_CAFFE2=1 python setup.py install +USE_OPENCV=1 BUILD_BINARY=1 python setup.py install diff --git a/docs/cpp/Doxyfile b/docs/cpp/Doxyfile index 7625ce44a5a56..a21dcf8184ad5 100644 --- a/docs/cpp/Doxyfile +++ b/docs/cpp/Doxyfile @@ -754,7 +754,6 @@ INPUT = ../../torch/csrc/api/include \ ../../torch/csrc/api/src \ ../../aten/src/ATen/ATen.h \ ../../aten/src/ATen/Backend.h \ - ../../aten/src/ATen/Context.h \ ../../aten/src/ATen/Device.h \ ../../aten/src/ATen/DeviceGuard.h \ ../../aten/src/ATen/Layout.h \ @@ -842,7 +841,7 @@ EXCLUDE_PATTERNS = # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* +EXCLUDE_SYMBOLS = c10::* caffe2::* cereal* DL* TH* cudnn* # The EXAMPLE_PATH tag can be used 
to specify one or more files or directories # that contain example code fragments that are included (see the \include diff --git a/docs/source/jit.rst b/docs/source/jit.rst new file mode 100644 index 0000000000000..35e62f52ecefd --- /dev/null +++ b/docs/source/jit.rst @@ -0,0 +1,556 @@ +Torch Script +============ + +.. contents:: :local: + +.. automodule:: torch.jit +.. currentmodule:: torch.jit + +Torch Script is a way to create serializable and optimizable models from PyTorch code. +Anything code written in Torch Script can be saved from your Python +process and loaded/run a process where there is no python dependency. + +We provide tools to incrementally transition a model from being a pure Python program +to a Torch Script program that can be run independently from python, for instance, in a standalone C++ process. +This makes it possible to train models in PyTorch using familiar tools and then export +the model to a production environment where it is not a good idea to run models as python programs +for performance and multi-threading reasons. + +Creating Torch Script Code +-------------------------- + + +.. autoclass:: ScriptModule + :members: + + .. method:: save(filename) + + Save an offline version of this module for use in a separate process. The saved + module serializes all of the methods and parameters of this module. It can be + loaded into the C++ API using ``torch::jit::load(filename)`` or into the Python + API with ``torch.jit.load(filename)``. + + To be able to save a module, it must not make any calls to native python functions. + This means that all submodules must be subclasses of ScriptModules as well. + + +.. autofunction:: load + +.. autofunction:: trace + + +Mixing Tracing and Scripting +---------------------------- + +In many cases either tracing or script is an easier approach for converting a model. +We allow you to compose tracing and scripting to suite the particular requirements +of a part of a model. + +Scripted functions can call traced ones. This is particularly useful when you need +to use control-flow around a simple feed-forward model. For instance the beam search +of a sequence to sequence model will typically be written in script but can call an +encoder module generated using tracing. + +Example: + +:: + + import torch + + def foo(x, y): + return 2 * x + y + traced_foo = torch.jit.trace(foo, (torch.rand(3), torch.rand(3))) + + @torch.jit.script + def bar(x): + return traced_foo(x, x) + +Traced functions can call script functions. This is useful when a small part of +a model requires some control-flow even though most of the model is just a feed-forward +network. 
Control-flow inside of a script function called by a traced function is +preserved correctly: + +Example: + +:: + + import torch + @torch.jit.script + def foo(x, y): + if x.max() > y.max(): + r = x + else: + r = y + return r + + + def bar(x, y, z): + return foo(x, y) + z + + traced_bar = torch.jit.trace(bar, (torch.rand(3), torch.rand(3), torch.rand(3)) + +This composition also works for modules as well, where it can be used to generate +a submodule using tracing that can be called from the methods of a script module: + +Example: + +:: + + import torch + import torchvision + + class MyScriptModule(torch.jit.ScriptModule): + def __init__(self): + super(MyScriptModule, self).__init__() + self.means = torch.nn.Parameter(torch.tensor([103.939, 116.779, 123.68]) + .resize_(1, 3, 1, 1)) + self.resnet = torch.jit.trace(torchvision.models.resnet18(), + torch.rand(1, 3, 224, 224)) + + @torch.jit.script_method + def forward(self, input): + return self.resnet(input - self.means) + + +Torch Script Language Reference +------------------------------- + +Torch Script is a subset of Python that can either be written directly (using +the @script annotations) or generated automatically from Python code via +tracing. When using tracing, code is automatically converted into this subset of +Python by recording only the actual operators on tensors and simply executing and +discarding the other surrounding Python code. + +When writing Torch Script directly using @script annotations, the programmer must +only use the subset of Python supported in Torch Script. This section documents +what is supported in Torch Script as if it were a language reference for a stand +alone language. Any features of Python not mentioned in this reference are not +part of Torch Script. + +As a subset of Python any valid Torch Script function is also a valid Python +function. This makes it possible to remove the @script annotations and debug the +function using standard Python tools like pdb. The reverse is not true: there +are many valid python programs that are not valid Torch Script programs. +Instead, Torch Script focuses specifically on the features of Python that are +needed to represent neural network models in Torch. + +.. envvar:: PYTORCH_JIT=1 + + Setting the environment variable ``PYTORCH_JIT=0`` will disable all script + and tracing annotations. If there is hard-to-debug error in one of your + ScriptModules, you can use this flag to force everything to run using native + Python. This allows the use of tools like ``pdb`` to debug code. + + +Types +~~~~~ + +The largest difference between Torch Script and the full Python language is that +Torch Script only support a small set of types that are needed to express neural +net models. In particular Torch Script supports: + +``Tensor`` + A PyTorch tensor of any dtype, dimension, or backend. + +``Tuple[T0, T1, ...]`` + A tuple containing subtypes ``T0``, ``T1``, etc. (e.g. ``Tuple[Tensor, Tensor]``) + +``int`` + A scalar integer + +``float`` + A scalar floating point number + +``List[T]`` + A list of which all members are type ``T`` + +Unlike Python, each variable in Torch Script function must have a single static type. +This makes it easier to optimize Torch Script functions. 
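+
+For contrast, a minimal sketch of a variant that would type-check, since ``r``
+is a ``Tensor`` on every path (the example that follows shows the mismatched
+case)::
+
+    @torch.jit.script
+    def no_error(x):
+        if x:
+            r = torch.rand(1)   # Tensor on the true branch
+        else:
+            r = torch.zeros(1)  # also a Tensor, so r keeps one static type
+        return r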
+ +Example:: + + @torch.jit.script + def an_error(x): + if x: + r = torch.rand(1) + else: + r = 4 + return r # Type mismatch: r is set to type Tensor in the true branch + # and type int in the false branch + +By default, all parameters to a Torch Script function are assumed to be Tensor +because this is the most common type used in modules. To specify that an +argument to a Torch Script function is another type, it is possible to use +MyPy-style type annotations using the types listed above: + +Example:: + + @torch.jit.script + def foo(x, tup): + # type: (int, Tuple[Tensor, Tensor]) -> Tensor + t0, t1 = tup + return t0 + t1 + x + + print(foo(3, (torch.rand(3), torch.rand(3)))) + +.. note:: + It is also possible to annotate types with Python 3 type annotations. + In our examples, we use comment-based annotations to ensure Python 2 + compatibility as well. + +Expressions +~~~~~~~~~~~ + +The following Python Expressions are supported + +Literals + ``True``, ``False``, ``None``, ``'string literals'``, ``"string literals"``, + number literals ``3`` (interpreted as int) ``3.4`` (interpreter as a float) + +Variables + ``a`` + + .. note:: + See `Variable Resolution`_ for how variables are resolved. + +Tuple Construction + ``(3, 4)``, ``(3,)`` + +List Construction + ``[3, 4]``, ``[]``, ``[torch.rand(3), torch.rand(4)]`` + + .. note:: + an empty list is assumed have type ``List[Tensor]``. + The types of other list literals are derived from the type of the members. + +Arithmetic Operators + ``a + b`` + ``a - b`` + ``a * b`` + ``a / b`` + ``a ^ b`` + ``a @ b`` + +Comparison Operators + ``a == b`` + ``a != b`` + ``a < b`` + ``a > b`` + ``a <= b`` + ``a >= b`` + +Logical Operators + ``a and b`` + ``a or b`` + ``not b`` + +Subscripts + ``t[0]`` + ``t[-1]`` + ``t[0:2]`` + ``t[1:]`` + ``t[:1]`` + ``t[:]`` + ``t[0, 1]`` + ``t[0, 1:2]`` + ``t[0, :1]`` + ``t[-1, 1:, 0]`` + ``t[1:, -1, 0]`` + ``t[i:j, i]`` + + .. note:: + Torch Script currently does not support mutating tensors in place, so any + tensor indexing can only appear on the right-hand size of an expression. + +Function calls + Calls to built-in functions: ``torch.rand(3, dtype=torch.int)`` + + Calls to other script functions: + + :: + + import torch + + @torch.jit.script + def foo(x): + return x + 1 + + @torch.jit.script + def bar(x): + return foo(x) + +Method calls + Calls to methods of builtin types like tensor: ``x.mm(y)`` + + + When defining a Script method inside of a ScriptModule, the ``@script_method`` + annotation is used. Inside of these methods it is possible to call other methods + of this class or access methods on the submodules. + + Calling a submodule directly (e.g. ``self.resnet(input)``) is equivalent to + calling its ``forward`` method (e.g. 
``self.resnet.forward(input)``) + + :: + + import torch + + class MyScriptModule(torch.jit.ScriptModule): + def __init__(self): + super(MyScriptModule, self).__init__() + self.means = torch.nn.Parameter(torch.tensor([103.939, 116.779, 123.68]) + .resize_(1, 3, 1, 1)) + self.resnet = torch.jit.trace(torchvision.models.resnet18(), + torch.rand(1, 3, 224, 224)) + + @torch.jit.script_method + def helper(self, input): + return self.resnet(input - self.means) + + @torch.jit.script_method + def forward(self, input): + return self.helper(input) + +If expressions + ``x if x > y else y`` + +Casts + ``float(ten)``, ``int(3.5)``, ``bool(ten)`` + +Accessing Module Parameters + ``self.my_parameter`` ``self.my_submodule.my_parameter`` + + +Statements +~~~~~~~~~~ + +Torch Script supports the following types of statements: + +Simple Assignments + + :: + + a = b + a += b # short-hand for a = a + b, does not operate in-place on a + a -= b + +Pattern Matching Assignments + + :: + + a, b = tuple_or_list + a, b, *c = a_tuple + +Print Statements + + ``print("the result of an add:", a + b)`` + +If Statements + + :: + + if a < 4: + r = -a + elif a < 3: + r = a + a + else: + r = 3 * a + +While Loops + + :: + + a = 0 + while a < 4: + print(a) + a += 1 + + +For loops with ``range`` + + :: + + x = 0 + for i in range(0, 10): + x *= i + + .. note:: + Script currently does not support iterating over generic iterable + objects like lists or tensors. This will be added in a future version. + +For loops over tuples: + + :: + + tup = (3, torch.rand(4)) + for x in tup: + print(x) + + .. note:: + for loops over tuples will unroll the loop, generating a body for + each member of the tuple. The body must type-check correctly for each member. + +For loops over constant ``torch.nn.ModuleList`` + + :: + + class SubModule(torch.jit.ScriptModule): + def __init__(self): + super(Sub, self).__init__() + self.weight = nn.Parameter(torch.randn(2)) + + @torch.jit.script_method + def forward(self, input): + return self.weight + input + + class MyModule(torch.jit.ScriptModule): + __constants__ = ['mods'] + + def __init__(self): + super(MyModule, self).__init__() + self.mods = torch.nn.ModuleList([SubModule() for i in range(10)]) + + @torch.jit.script_method + def forward(self, v): + for module in self.mods: + v = m(v) + return v + + .. note:: + To use a module list inside a ``@script_method`` it must be marked + constant by adding the name of the attribute to the ``__constants__`` + list for the type. For loops over a ModuleList will unroll the body of the + loop at compile time, with each member of the constant module list. + +Return + ``return a, b`` + + .. note:: + there must be a return statement as the last member of the function + and return statements cannot appear anywhere else in the function. This + restriction will be removed in the future. + +Variable Resolution +~~~~~~~~~~~~~~~~~~~ + +Torch Script supports a subset of Python's variable resolution (i.e. scoping) +rules. Local variables behave the same as in Python, except for the restriction +that a variable must have the same type along all paths through a function. +If a variable has a different type on different sides of an if statement, it +is an error to use it after the end of the if statement. + +Similarly, a variable is not allowed to be used if it is only *defined* along some +paths through the function. 
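+
+A sketch of the corrected pattern, where the variable is given a value on
+every path and can therefore be used after the if statement (the example
+that follows shows the error case)::
+
+    @torch.jit.script
+    def foo(x):
+        if x < 0:
+            y = 4
+        else:
+            y = 0
+        print(y)  # ok: y is defined on all paths through the function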
+ +Example:: + + @torch.jit.script + def foo(x): + if x < 0: + y = 4 + print(y) # Error: undefined value y + +Non-local variables are resolved to Python values at compile time when the +function is defined. These values are then converted into Torch Script values using +the rules described in `Use of Python Values`_. + +Use of Python Values +~~~~~~~~~~~~~~~~~~~~ + +To make writing Torch Script more convenient, we allow script code to refer +to Python values in the surrounding scope. For instance, any time there is a +reference to ``torch``, the Torch Script compiler is actually resolving it to the +``torch`` Python module when the function is declared. These Python values are +not a first class part of Torch Script. Instead they are desugared at compile-time +into the primitive types that Torch Script supports. This section describes the +rules that are used when accessing Python values in Torch Script. They depend +on the dynamic type of the python valued referenced. + +Functions + Torch Script can call python functions. This functionality is very useful when + incrementally converting a model into script. The model can be moved function-by-function + to script, leaving calls to Python functions in place. This way you can incrementally + check the correctness of the model as you go. + + Example:: + + def foo(x): + print("I am called with {}".format(x)) + import pdb; pdb.set_trace() + return x + + @torch.jit.script + def bar(x) + return foo(x + 1) + + .. note:: + Attempting to call ``save`` on a ScriptModule that contains calls to Python + functions will fail. The intention is that this pathway is used for debugging + and the calls removed or turned into script functions before saving. + + +Attribute Lookup On Python Modules + Torch Script can lookup attributes on modules. Builtin functions like ``torch.add`` + are accessed this way. This allows Torch Script to call functions defined in + other modules. + +Python-defined Constants + Torch Script also provides a way to use constants that are defined in Python. + These can be used to hard-code hyper-parameters into the function, or to + define universal constants. There are two ways of specifying that a Python + value should be treated as a constant. + + 1. Values looked up as attributes of a module are assumed to be constant. + Example: ``math.pi`` + 2. Attributes of a ScriptModule can be marked constant by listing them + as a member of the ``__constants__`` property of the class: + + Example:: + + class Foo(torch.jit.ScriptModule): + __constants__ = ['a'] + + def __init__(self): + super(Foo, self).__init__(False) + self.a = 1 + 4 + + @torch.jit.ScriptModule + def forward(self, input): + return self.a + input + + Supported constant Python Values are + + * ``int`` + * ``bool`` + * ``torch.device`` + * ``torch.layout`` + * ``torch.dtype`` + * tuples containing supported types + * ``torch.nn.ModuleList`` which can be used in a TorchScript for loop + + +Debugging +~~~~~~~~~ + +Print things + +Use ``USE_PYTHON=0`` to debug in normal python mode + +Look at the graph + +Pay attention to tracer warnings + + +Builtin Functions +~~~~~~~~~~~~~~~~~ + +Torch Script supports a subset of the builtin tensor and neural network functions that +PyTorch provides. Most methods on Tensor as well as functions in the ``torch`` +namespace are available. Many functions in ``torch.nn.functional`` are also availiable. + + +We currently do not provide any builtin ScriptModules e.g. a ``Linear`` or +``Conv`` module. 
This functionality is something that will be developed in the future. +For now we suggest using ``torch.jit.trace`` to transform standard ``torch.nn`` +modules into ScriptModules on construction. + +.. automodule:: torch.jit.supported_ops diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst index 7694fe455b9a7..a329bb049baac 100644 --- a/docs/source/sparse.rst +++ b/docs/source/sparse.rst @@ -82,7 +82,7 @@ An empty sparse tensor can be constructed by specifying its size: whether or not they are coalesced or not (e.g., :func:`torch.sparse.FloatTensor._values` and :func:`torch.sparse.FloatTensor._indices`, as well as - :func:`torch.Tensor._sparse_mask`). These operators are + :func:`torch.Tensor.sparse_mask`). These operators are prefixed by an underscore to indicate that they reveal internal implementation details and should be used with care, since code that works with coalesced sparse tensors may not work with diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 468de8f5b9853..85f6232ff44b6 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -285,6 +285,7 @@ view of a storage and defines numeric operations on it. .. automethod:: masked_fill_ .. automethod:: masked_select .. automethod:: matmul + .. automethod:: matrix_power .. automethod:: max .. automethod:: mean .. automethod:: median diff --git a/docs/source/torch.rst b/docs/source/torch.rst index 18d21f0e2a1a9..31585d4a96977 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -169,7 +169,7 @@ Pointwise Ops .. autofunction:: cos .. autofunction:: cosh .. autofunction:: div -.. autofunction:: digamma +.. autofunction:: digamma .. autofunction:: erf .. autofunction:: erfc .. autofunction:: erfinv @@ -296,6 +296,7 @@ BLAS and LAPACK Operations .. autofunction:: logdet .. autofunction:: slogdet .. autofunction:: matmul +.. autofunction:: matrix_power .. autofunction:: matrix_rank .. autofunction:: mm .. 
autofunction:: mv diff --git a/scripts/onnx/install-develop.sh b/scripts/onnx/install-develop.sh index 7b0502fa8b417..02888b11901aa 100755 --- a/scripts/onnx/install-develop.sh +++ b/scripts/onnx/install-develop.sh @@ -15,4 +15,4 @@ pip install -e "$tp2_dir/onnx" # Install caffe2 and pytorch pip install -r "$top_dir/caffe2/requirements.txt" pip install -r "$top_dir/requirements.txt" -FULL_CAFFE2=1 python setup.py build_deps develop +USE_OPENCV=1 BUILD_BINARY=1 python setup.py build_deps develop diff --git a/scripts/onnx/install.sh b/scripts/onnx/install.sh index 21ce48a389659..7d4a3139d2e00 100755 --- a/scripts/onnx/install.sh +++ b/scripts/onnx/install.sh @@ -35,4 +35,4 @@ _pip_install -b "$BUILD_DIR/onnx" "file://$tp2_dir/onnx#egg=onnx" # Install caffe2 and pytorch pip install -r "$top_dir/caffe2/requirements.txt" pip install -r "$top_dir/requirements.txt" -FULL_CAFFE2=1 python setup.py install +USE_OPENCV=1 BUILD_BINARY=1 python setup.py install diff --git a/setup.py b/setup.py index cf1b0c9696ccd..3cc5c0c1642a0 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,9 @@ # NO_CUDNN # disables the cuDNN build # +# NO_TEST +# disables the test build +# # NO_MIOPEN # disables the MIOpen build # @@ -46,6 +49,12 @@ # USE_GLOO_IBVERBS # toggle features related to distributed support # +# USE_OPENCV +# enables use of OpenCV for additional operators +# +# BUILD_BINARY +# enables the additional binaries/ build +# # PYTORCH_BUILD_VERSION # PYTORCH_BUILD_NUMBER # specify the version of PyTorch, rather than the hard-coded version @@ -117,25 +126,29 @@ from tools.setup_helpers.env import check_env_flag, check_negative_env_flag -# Before we run the setup_helpers, let's look for NO_* and WITH_* -# variables and hotpatch the environment with the USE_* equivalent -config_env_vars = ['CUDA', 'CUDNN', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED', - 'SYSTEM_NCCL', 'GLOO_IBVERBS'] - -def hotpatch_var(var): +def hotpatch_var(var, prefix='USE_'): if check_env_flag('NO_' + var): - os.environ['USE_' + var] = '0' + os.environ[prefix + var] = '0' elif check_negative_env_flag('NO_' + var): - os.environ['USE_' + var] = '1' + os.environ[prefix + var] = '1' elif check_env_flag('WITH_' + var): - os.environ['USE_' + var] = '1' + os.environ[prefix + var] = '1' elif check_negative_env_flag('WITH_' + var): - os.environ['USE_' + var] = '0' + os.environ[prefix + var] = '0' + +# Before we run the setup_helpers, let's look for NO_* and WITH_* +# variables and hotpatch environment with the USE_* equivalent +use_env_vars = ['CUDA', 'CUDNN', 'MIOPEN', 'MKLDNN', 'NNPACK', 'DISTRIBUTED', + 'OPENCV', 'SYSTEM_NCCL', 'GLOO_IBVERBS'] +list(map(hotpatch_var, use_env_vars)) -list(map(hotpatch_var, config_env_vars)) +# Also hotpatch a few with BUILD_* equivalent +build_env_vars = ['BINARY', 'TEST'] +[hotpatch_var(v, 'BUILD_') for v in build_env_vars] from tools.setup_helpers.cuda import USE_CUDA, CUDA_HOME, CUDA_VERSION +from tools.setup_helpers.build import BUILD_BINARY, BUILD_TEST, USE_OPENCV from tools.setup_helpers.rocm import USE_ROCM, ROCM_HOME, ROCM_VERSION from tools.setup_helpers.cudnn import (USE_CUDNN, CUDNN_LIBRARY, CUDNN_LIB_DIR, CUDNN_INCLUDE_DIR) @@ -161,7 +174,6 @@ def hotpatch_var(var): IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') -FULL_CAFFE2 = check_env_flag('FULL_CAFFE2') BUILD_PYTORCH = check_env_flag('BUILD_PYTORCH') USE_CUDA_STATIC_LINK = check_env_flag('USE_CUDA_STATIC_LINK') @@ -359,10 +371,13 @@ def build_libs(libs): build_libs_cmd += ['--use-mkldnn'] if USE_GLOO_IBVERBS: 
build_libs_cmd += ['--use-gloo-ibverbs'] - if FULL_CAFFE2: - build_libs_cmd += ['--full-caffe2'] my_env["BUILD_TORCH"] = "ON" + my_env["BUILD_PYTHON"] = "ON" + my_env["BUILD_BINARY"] = "ON" if BUILD_BINARY else "OFF" + my_env["BUILD_TEST"] = "ON" if BUILD_TEST else "OFF" + my_env["INSTALL_TEST"] = "ON" if BUILD_TEST else "OFF" + my_env["USE_OPENCV"] = "ON" if USE_OPENCV else "OFF" try: os.mkdir('build') @@ -892,9 +907,10 @@ def run(self): include_dirs += [tmp_install_path + "/include/THD"] main_link_args += [THD_LIB] if IS_LINUX and USE_CUDA: - extra_compile_args += ['-DUSE_C10D'] - main_sources += ['torch/csrc/distributed/c10d/init.cpp'] - main_link_args += [C10D_LIB] + extra_compile_args.append('-DUSE_C10D') + main_sources.append('torch/csrc/distributed/c10d/init.cpp') + main_sources.append('torch/csrc/distributed/c10d/ddp.cpp') + main_link_args.append(C10D_LIB) if USE_CUDA: nvtoolext_lib_name = None @@ -1072,11 +1088,12 @@ def make_relative_rpath(path): name=str('caffe2.python.caffe2_pybind11_state'), sources=[]), ) -extensions.append( - setuptools.Extension( - name=str('caffe2.python.caffe2_pybind11_state_gpu'), - sources=[]), -) +if USE_CUDA: + extensions.append( + setuptools.Extension( + name=str('caffe2.python.caffe2_pybind11_state_gpu'), + sources=[]), + ) cmdclass = { 'create_version_file': create_version_file, diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp index 375fd5819298c..f08a30d13c1e6 100644 --- a/test/cpp/api/tensor.cpp +++ b/test/cpp/api/tensor.cpp @@ -10,12 +10,12 @@ template bool exactly_equal(at::Tensor left, T right) { - return left._local_scalar().to() == right; + return at::_local_scalar(left).to() == right; } template bool almost_equal(at::Tensor left, T right, T tolerance = 1e-4) { - return std::abs(left._local_scalar().to() - right) < tolerance; + return std::abs(at::_local_scalar(left).to() - right) < tolerance; } #define REQUIRE_TENSOR_OPTIONS(device_, index_, type_, layout_) \ diff --git a/test/cpp/api/tensor_options.cpp b/test/cpp/api/tensor_options.cpp index 9065f454297f0..ab80c5f45ab39 100644 --- a/test/cpp/api/tensor_options.cpp +++ b/test/cpp/api/tensor_options.cpp @@ -32,7 +32,7 @@ TEST_CASE("TensorOptions/DefaultsToTheRightValues") { TEST_CASE("TensorOptions/ReturnsTheCorrectType") { auto options = TensorOptions().device(kCPU).dtype(kInt).layout(kSparse); - REQUIRE(at::getMaybeVariableType(options) == getNonVariableType(Backend::SparseCPU, kInt)); + REQUIRE(at::getType(options) == getNonVariableType(Backend::SparseCPU, kInt)); } TEST_CASE("TensorOptions/UtilityFunctionsReturnTheRightTensorOptions") { diff --git a/test/cpp_extensions/complex_registration_extension.cpp b/test/cpp_extensions/complex_registration_extension.cpp new file mode 100644 index 0000000000000..4f7cd29cc5a18 --- /dev/null +++ b/test/cpp_extensions/complex_registration_extension.cpp @@ -0,0 +1,109 @@ +#include + +#include +#include +#include +#include + +#include "ATen/Allocator.h" +#include "ATen/CPUGenerator.h" +#include "ATen/DeviceGuard.h" +#include "ATen/NativeFunctions.h" +#include "ATen/TensorImpl.h" +#include "ATen/UndefinedTensor.h" +#include "ATen/Utils.h" +#include "ATen/WrapDimUtils.h" +#include "ATen/core/Half.h" +#include "ATen/core/optional.h" + +#include +#include +#include +#include + +#include "ATen/Config.h" + +namespace at { + +struct CPUComplexFloatType : public at::CPUTypeDefault { + CPUComplexFloatType() + : CPUTypeDefault( + CPUTensorId(), + /*is_variable=*/false, + /*is_undefined=*/false) {} + + ScalarType scalarType() const override; + Backend 
backend() const override; + const char* toString() const override; + size_t elementSizeInBytes() const override; + TypeID ID() const override; + Tensor& s_copy_(Tensor& self, const Tensor& src, bool non_blocking) + const override; + Tensor& _s_copy_from(const Tensor& self, Tensor& dst, bool non_blocking) + const override; + + Tensor tensor(IntList size) const override { + // TODO: Upstream this + int64_t numel = 1; + for (auto s : size) { + numel *= s; + } + Storage s{c10::make_intrusive( + scalarTypeToDataType(ScalarType::ComplexFloat), + numel, + getCPUAllocator(), + /* resizable */ true)}; + Tensor t{c10::make_intrusive( + std::move(s), + at::CPUTensorId(), + /* is_variable */ false)}; + return t; + } +}; + +struct ComplexHooks : public at::ComplexHooksInterface { + ComplexHooks(ComplexHooksArgs) {} + void registerComplexTypes(Context* context) const override { + context->registerType( + Backend::CPU, ScalarType::ComplexFloat, new CPUComplexFloatType()); + } +}; + +ScalarType CPUComplexFloatType::scalarType() const { + return ScalarType::ComplexFloat; +} + +Backend CPUComplexFloatType::backend() const { + return Backend::CPU; +} + +const char* CPUComplexFloatType::toString() const { + return "CPUComplexFloatType"; +} +TypeID CPUComplexFloatType::ID() const { + return TypeID::CPUComplexFloat; +} + +size_t CPUComplexFloatType::elementSizeInBytes() const { + return sizeof(float); +} + +Tensor& CPUComplexFloatType::s_copy_( + Tensor& dst, + const Tensor& src, + bool non_blocking) const { + AT_ERROR("not yet supported"); +} + +Tensor& CPUComplexFloatType::_s_copy_from( + const Tensor& src, + Tensor& dst, + bool non_blocking) const { + AT_ERROR("not yet supported"); +} + +REGISTER_COMPLEX_HOOKS(ComplexHooks); + +} // namespace at + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { } diff --git a/test/expect/TestJit.test_export_expand_aten_fallback.expect b/test/expect/TestJit.test_export_expand_aten_fallback.expect new file mode 100644 index 0000000000000..8baa082f72dbb --- /dev/null +++ b/test/expect/TestJit.test_export_expand_aten_fallback.expect @@ -0,0 +1,37 @@ +ModelProto { + producer_name: "pytorch" + domain: "" + doc_string: "" + graph: + GraphProto { + name: "torch-jit-export" + inputs: [{name: "y.1", type:Tensor dims: 3 4 1}] + outputs: [{name: "5", type:Tensor dims: 3 4 4}] + initializers: [] + nodes: [ + Node {type: "Constant", inputs: [], outputs: [1], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Constant", inputs: [], outputs: [2], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Constant", inputs: [], outputs: [3], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Constant", inputs: [], outputs: [4], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "Loop", inputs: [3,4,y.1], outputs: [5], attributes: [{ name: 'body', type: graph, value: + GraphProto { + name: "torch-jit-export1" + inputs: [{name: "i", type:Tensor dims: },{name: "cond", type:Tensor dims: },{name: "8", type:Tensor dims: }] + outputs: [{name: "15", type:Tensor dims: },{name: "16", type:Tensor dims: }] + initializers: [] + nodes: [ + Node {type: "Unsqueeze", inputs: [2], outputs: [9], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [1], outputs: [10], attributes: [{ name: 'axes', type: ints, values: [0]}]}, + Node {type: "Unsqueeze", inputs: [i], outputs: [11], attributes: [{ name: 'axes', type: ints, values: 
[0]}]}, + Node {type: "Concat", inputs: [9,10,11], outputs: [12], attributes: [{ name: 'axis', type: int, value: 0}]}, + Node {type: "Constant", inputs: [], outputs: [13], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]}, + Node {type: "ATen", inputs: [y.1,12,13], outputs: [16], attributes: [{ name: 'operator', type: string, value: 'expand'}]}, + Node {type: "Constant", inputs: [], outputs: [15], attributes: [{ name: 'value', type: tensor, value:TensorProto shape: []}]} + ] + } + + }]} + ] + } + opset_import: [OperatorSetIdProto { domain: }], +} diff --git a/test/test_autograd.py b/test/test_autograd.py index 62252cfb4c1b9..faba5efb1847d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2473,6 +2473,47 @@ def test_symeig_no_eigenvectors(self): with self.assertRaisesRegex(RuntimeError, 'backward without computing eigenvectors'): torch.autograd.backward([w, v], [torch.ones_like(w), torch.ones_like(v)]) + def test_no_grad_copy(self): + # create autograd function that saves grad pointer as class static + class MyFunc(Function): + static_grad_ptr = None + + @staticmethod + def forward(ctx, inp1, inp2): + return inp1 + inp2 + + @staticmethod + def backward(ctx, grad): + MyFunc.static_grad_ptr = grad.data_ptr() + return grad, grad + + class NonContGradFunc(Function): + @staticmethod + def forward(ctx, inp1): + ctx.size = inp1.size() + return torch.tensor([1.]) + + @staticmethod + def backward(ctx, grad): + return torch.ones(1).expand(ctx.size) + + a = torch.randn(5, 6, requires_grad=True) + b = torch.randn(5, 6, requires_grad=True) + # non-contiguous grad should be copied + NonContGradFunc.apply(MyFunc.apply(a, b)).backward() + self.assertFalse(a.grad.data_ptr() == MyFunc.static_grad_ptr) + self.assertFalse(b.grad.data_ptr() == MyFunc.static_grad_ptr) + # test case that should trigger no copy for one of a,b + a.grad = b.grad = None + MyFunc.apply(a, b)[1][0].backward() + p_g = MyFunc.static_grad_ptr + p_a = a.grad.data_ptr() + p_b = b.grad.data_ptr() + # check a,b uses different grad buffer + self.assertFalse(p_a == p_b) + # check one of them is using the computed buffer + self.assertTrue(p_a == p_g or p_b == p_g) + def index_variable(shape, max_indices): if not isinstance(shape, tuple): @@ -2935,6 +2976,14 @@ class dont_convert(tuple): ('matmul', (S, S, M, M), ((S, S, M, S),), "4d_4d"), ('matmul', (S, S, M, M), ((M,),), "4d_1d"), ('matmul', (M,), ((S, S, M, S),), "1d_4d"), + ('matrix_power', (S, S), [2], "n=2"), + ('matrix_power', (S, S, S), [3], "n=3"), + ('matrix_power', (S, S, S), [1], "n=1"), + ('matrix_power', (S, S, S), [0], "n=0"), + ('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S), [-1], "n=-1", + NO_ARGS, [skipIfNoLapack]), + ('matrix_power', lambda: random_fullrank_matrix_distinct_singular_value(S), [-3], "n=-3", + NO_ARGS, [skipIfNoLapack]), ('addcmul', (S, S), ((S, S), (S, S))), ('addcmul', (S, S), ((S, 1), (1, S)), 'broadcast_rhs'), ('addcmul', (1,), ((S, S, 1), (1, S)), 'broadcast_all'), diff --git a/test/test_c10d.py b/test/test_c10d.py index 395f8135f9922..755982246ac4a 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -3,6 +3,7 @@ import multiprocessing import sys import tempfile +import time import unittest from datetime import timedelta @@ -24,7 +25,7 @@ sys.exit(0) -TIMEOUT_DEFAULT = 5 +TIMEOUT_DEFAULT = 15 TIMEOUT_OVERRIDE = {} TestSkip = namedtuple('TestSkip', 'exit_code, message') @@ -267,23 +268,31 @@ def _run(self, rank): def _join_processes(self, fn): timeout = get_timeout(self.id()) + 
start_time = time.time() for p in self.processes: p.join(timeout) - self._check_return_codes() + elapsed_time = time.time() - start_time + self._check_return_codes(elapsed_time) - def _check_return_codes(self): + def _check_return_codes(self, elapsed_time): """ Checks that the return codes of all spawned processes match, and skips tests if they returned a return code indicating a skipping condition. """ first_process = self.processes[0] - for p in self.processes: + for i, p in enumerate(self.processes): + if p.exitcode is None: + raise RuntimeError('Process {} terminated or timed out after {} seconds'.format(i, elapsed_time)) self.assertEqual(p.exitcode, first_process.exitcode) for skip in TEST_SKIPS.values(): if first_process.exitcode == skip.exit_code: raise unittest.SkipTest(skip.message) self.assertEqual(first_process.exitcode, 0) + @property + def is_master(self): + return self.rank == 0 + class ProcessGroupGlooTest(MultiProcessTestCase): def opts(self): @@ -357,6 +366,42 @@ def allreduce(x, op): work.wait() self.assertEqual(torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x) + def test_send_recv_all_to_all(self): + store = c10d.FileStore(self.file.name) + pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts()) + + # Preallocate tensors for input/output + inputs = [torch.Tensor([self.rank]) for _ in range(self.world_size)] + outputs = [torch.Tensor([-1]) for _ in range(self.world_size)] + + # Issue sends + send_work = [] + for i in range(self.world_size): + if i == self.rank: + continue + send_work.append(pg.send([inputs[i]], i)) + + # Issue recvs + recv_work = [] + for i in range(self.world_size): + if i == self.rank: + continue + recv_work.append(pg.recv([outputs[i]], i)) + + # Wait for sends to complete + for work in send_work: + work.wait() + + # Wait for recvs to complete + for work in recv_work: + work.wait() + + # Test that every output other than our own contains the respective rank + for i in range(self.world_size): + if i == self.rank: + continue + self.assertEqual(torch.Tensor([i]), outputs[i]) + class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -571,7 +616,7 @@ def update_parameters(model): @skip_if_not_multigpu def test_gloo_backend(self): - store = c10d.TCPStore('localhost', self.port, self.rank == 0) + store = c10d.TCPStore('localhost', self.port, self.is_master) options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) @@ -580,10 +625,103 @@ def test_gloo_backend(self): @skip_if_not_multigpu @skip_if_not_nccl def test_nccl_backend(self): - store = c10d.TCPStore('localhost', self.port, self.rank == 0) + store = c10d.TCPStore('localhost', self.port, self.is_master) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) self._test_ddp_with_process_group(process_group) + @skip_if_not_multigpu + def test_dist_broadcast_coalesced(self): + # Set up process group. + store = c10d.TCPStore('localhost', self.port, self.is_master) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + + device = torch.device('cuda') + + target = torch.arange(10, dtype=torch.float64, device=device).chunk(5) + + if self.is_master: + # All processes should have these tensors in the end. 
+ tensors = target + else: + # Non-master processes start with empty tensors and should be + # filled with the tensors from the master. + tensors = torch.zeros(10, device=device).chunk(5) + + c10d._dist_broadcast_coalesced( + tensors, + buffer_size=10, + process_group=process_group) + + if not self.is_master: + self.assertEqual(tensors, target) + + @skip_if_not_multigpu + def test_sync_params_no_buffers(self): + # Set up process group. + store = c10d.TCPStore('localhost', self.port, self.is_master) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + + # Use all available devices on every process here (data is small, so should be fine). + devices = gpus_for_rank(self.world_size)[self.rank] + target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5) + parameter_data = [target] + parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]] + buffer_data = [[]] * len(parameter_data) + + c10d._sync_params( + process_group, + parameter_data=parameter_data, + buffer_data=buffer_data, + devices=devices, + broadcast_bucket_size=10, + broadcast_buffers=False) + + for device_data in parameter_data: + for i, parameter in enumerate(device_data): + self.assertEqual(parameter, target[i]) + + @skip_if_not_multigpu + def test_sync_params_with_buffers(self): + # Set up process group. + store = c10d.TCPStore('localhost', self.port, self.is_master) + options = c10d.ProcessGroupGloo.Options() + options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")] + process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) + + devices = gpus_for_rank(self.world_size)[self.rank] + target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5) + parameter_data = [target] + parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]] + + # sync_params should do a dist_broadcast for buffers, so we only populate the master buffers and + # then check that other processes' tensors end up matching. 
+ + if self.is_master: + buffer_data = [target] + buffer_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]] + else: + buffer_data = [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices] + + c10d._sync_params( + process_group, + parameter_data=parameter_data, + buffer_data=buffer_data, + devices=devices, + broadcast_bucket_size=10, + broadcast_buffers=True) + + for device_data in parameter_data: + for i, parameter in enumerate(device_data): + self.assertEqual(parameter, target[i]) + + for device_data in buffer_data: + for i, buffer in enumerate(device_data): + self.assertEqual(buffer, target[i]) + if __name__ == '__main__': assert not torch.cuda._initialized, "test_distributed must not have initialized CUDA context on main process" diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index 4d35c0ced3c78..a5312cd038300 100755 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -267,108 +267,9 @@ def test_lenient_flag_handling_in_jit_extensions(self): self.assertEqual(z, x.tanh() + y.tanh()) def test_complex_registration(self): - cpp_source = ''' - #include - #include - #include - #include - - #include "ATen/TensorImpl.h" - #include "ATen/CPUGenerator.h" - #include "ATen/TensorImpl.h" - #include "ATen/Allocator.h" - #include "ATen/DeviceGuard.h" - #include "ATen/NativeFunctions.h" - #include "ATen/UndefinedTensor.h" - #include "ATen/Utils.h" - #include "ATen/WrapDimUtils.h" - #include "ATen/core/Half.h" - #include "ATen/core/optional.h" - - #include - #include - #include - #include - - #include "ATen/Config.h" - - namespace at { - - struct CPUComplexFloatType : public at::CPUTypeDefault { - - CPUComplexFloatType() - : CPUTypeDefault(CPUTensorId(), /*is_variable=*/false, /*is_undefined=*/false) {} - - ScalarType scalarType() const override; - Backend backend() const override; - const char * toString() const override; - size_t elementSizeInBytes() const override; - TypeID ID() const override; - Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; - Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const override; - - Tensor tensor(IntList size) const override { - // TODO: Upstream this - int64_t numel = 1; - for (auto s : size) { - numel *= s; - } - Storage s{c10::make_intrusive( - scalarTypeToDataType(ScalarType::ComplexFloat), - numel, - getCPUAllocator(), - /* resizable */ true)}; - Tensor t{c10::make_intrusive( - std::move(s), - at::CPUTensorId(), - /* is_variable */ false)}; - return t; - } - }; - - struct ComplexHooks : public at::ComplexHooksInterface { - ComplexHooks(ComplexHooksArgs) {} - void registerComplexTypes(Context* context) const override { - context->registerType(Backend::CPU, ScalarType::ComplexFloat, new CPUComplexFloatType()); - } - }; - - ScalarType CPUComplexFloatType::scalarType() const { - return ScalarType::ComplexFloat; - } - - Backend CPUComplexFloatType::backend() const { - return Backend::CPU; - } - - const char * CPUComplexFloatType::toString() const { - return "CPUComplexFloatType"; - } - TypeID CPUComplexFloatType::ID() const { - return TypeID::CPUComplexFloat; - } - - size_t CPUComplexFloatType::elementSizeInBytes() const { - return sizeof(float); - } - - Tensor & CPUComplexFloatType::s_copy_(Tensor & dst, const Tensor & src, bool non_blocking) const { - AT_ERROR("not yet supported"); - } - - Tensor & CPUComplexFloatType::_s_copy_from(const Tensor & src, Tensor & dst, bool non_blocking) const { - 
AT_ERROR("not yet supported"); - } - - REGISTER_COMPLEX_HOOKS(ComplexHooks); - - } // namespace at - ''' - - module = torch.utils.cpp_extension.load_inline( + module = torch.utils.cpp_extension.load( name='complex_registration_extension', - cpp_sources=cpp_source, - functions=[], + sources='cpp_extensions/complex_registration_extension.cpp', verbose=True) torch.empty(2, 2, dtype=torch.complex64) diff --git a/test/test_cuda.py b/test/test_cuda.py index b174a63201eda..1ca7155dd098c 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1460,6 +1460,10 @@ def test_pinverse(self): def test_matrix_rank(self): TestTorch._test_matrix_rank(self, lambda x: x.cuda()) + @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") + def test_matrix_power(self): + TestTorch._test_matrix_power(self, conv_fn=lambda t: t.cuda()) + @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") def test_det_logdet_slogdet(self): TestTorch._test_det_logdet_slogdet(self, lambda t: t.cuda()) diff --git a/test/test_distributed.py b/test/test_distributed.py index dd21a597e6680..c37eac2689e4b 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -241,6 +241,7 @@ def test_destroy_group(self): else: group = [0, 1] group_id = dist.new_group(group) + self._barrier() dist.destroy_process_group(group_id) # Test get rank and size of group @@ -260,6 +261,7 @@ def test_get_rank_size_group(self): # Test destroy full groups def test_destroy_full_group(self): _, group_id, _ = self._init_full_group_test() + self._barrier() dist.destroy_process_group(group_id) # Test get rank and size of full group @@ -269,7 +271,6 @@ def test_get_rank_size_full_group(self): self.assertEqual(dist.get_rank(group_id), dist.get_rank()) # SEND RECV - @unittest.skipIf(BACKEND == "gloo", "Gloo does not support send/recv") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") def test_send_recv(self): rank = dist.get_rank() @@ -292,9 +293,6 @@ def test_send_recv(self): self._barrier() # SEND RECV ANY SOURCE - @unittest.skipIf( - BACKEND == "gloo", "Gloo does not support send/recv from any source" - ) @unittest.skipIf( BACKEND == "nccl", "Nccl does not support send/recv from any source" ) @@ -325,7 +323,6 @@ def test_send_recv_any_source(self): self._barrier() # ISEND - @unittest.skipIf(BACKEND == "gloo", "Gloo does not support isend") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support isend") def test_isend(self): rank = dist.get_rank() @@ -347,7 +344,6 @@ def test_isend(self): self._barrier() # IRECV - @unittest.skipIf(BACKEND == "gloo", "Gloo does not support irecv") @unittest.skipIf(BACKEND == "nccl", "Nccl does not support irecv") def test_irecv(self): rank = dist.get_rank() @@ -1272,6 +1268,7 @@ def _run(self, rank): # self.id() == e.g. '__main__.TestDistributed.test_get_rank' # We're retreiving a corresponding test and executing it. 
getattr(self, self.id().split(".")[2])() + self._barrier() dist.destroy_process_group() sys.exit(0) diff --git a/test/test_distributions.py b/test/test_distributions.py index c72f2bb2eca7d..3aa414a77515e 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -708,13 +708,15 @@ def _check_sampler_discrete(self, torch_dist, ref_dist, message, self.assertGreater(p, failure_rate, message) def _check_enumerate_support(self, dist, examples): - for param, expected in examples: - param = torch.tensor(param) + for params, expected in examples: + params = {k: torch.tensor(v) for k, v in params.items()} expected = torch.tensor(expected) - actual = dist(param).enumerate_support() - self.assertEqual(actual, expected) - actual = dist(param).enumerate_support() + d = dist(**params) + actual = d.enumerate_support(expand=False) self.assertEqual(actual, expected) + actual = d.enumerate_support(expand=True) + expected_with_expand = expected.expand((-1,) + d.batch_shape + d.event_shape) + self.assertEqual(actual, expected_with_expand) def test_repr(self): for Dist, params in EXAMPLES: @@ -818,9 +820,9 @@ def ref_log_prob(idx, val, log_prob): def test_bernoulli_enumerate_support(self): examples = [ - ([0.1], [[0], [1]]), - ([0.1, 0.9], [[0, 0], [1, 1]]), - ([[0.1, 0.2], [0.3, 0.4]], [[[0, 0], [0, 0]], [[1, 1], [1, 1]]]), + ({"probs": [0.1]}, [[0], [1]]), + ({"probs": [0.1, 0.9]}, [[0], [1]]), + ({"probs": [[0.1, 0.2], [0.3, 0.4]]}, [[[0]], [[1]]]), ] self._check_enumerate_support(Bernoulli, examples) @@ -903,6 +905,14 @@ def test_binomial_log_prob_vectorized_count(self): expected = scipy.stats.binom(total_count.cpu().numpy(), probs.cpu().numpy()).logpmf(sample) self.assertAlmostEqual(log_prob, expected, places=4) + def test_binomial_enumerate_support(self): + examples = [ + ({"probs": [0.1], "total_count": 2}, [[0], [1], [2]]), + ({"probs": [0.1, 0.9], "total_count": 2}, [[0], [1], [2]]), + ({"probs": [[0.1, 0.2], [0.3, 0.4]], "total_count": 3}, [[[0]], [[1]], [[2]], [[3]]]), + ] + self._check_enumerate_support(Binomial, examples) + def test_binomial_extreme_vals(self): total_count = 100 bin0 = Binomial(total_count, 0) @@ -929,13 +939,6 @@ def test_binomial_vectorized_count(self): self.assertEqual(samples.mean(dim=0), bin1.mean, prec=0.02) self.assertEqual(samples.var(dim=0), bin1.variance, prec=0.02) - def test_binomial_enumerate_support(self): - set_rng_seed(0) - bin0 = Binomial(0, torch.tensor(1.)) - self.assertEqual(bin0.enumerate_support(), torch.tensor([0.])) - bin1 = Binomial(torch.tensor(5), torch.tensor(0.5)) - self.assertEqual(bin1.enumerate_support(), torch.arange(6)) - def test_negative_binomial(self): p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) for total_count in [1, 2, 10]: @@ -1056,8 +1059,8 @@ def ref_log_prob(idx, val, log_prob): def test_categorical_enumerate_support(self): examples = [ - ([0.1, 0.2, 0.7], [0, 1, 2]), - ([[0.1, 0.9], [0.3, 0.7]], [[0, 0], [1, 1]]), + ({"probs": [0.1, 0.2, 0.7]}, [0, 1, 2]), + ({"probs": [[0.1, 0.9], [0.3, 0.7]]}, [[0], [1]]), ] self._check_enumerate_support(Categorical, examples) @@ -1086,8 +1089,8 @@ def test_one_hot_categorical_2d(self): def test_one_hot_categorical_enumerate_support(self): examples = [ - ([0.1, 0.2, 0.7], [[1, 0, 0], [0, 1, 0], [0, 0, 1]]), - ([[0.1, 0.9], [0.3, 0.7]], [[[1, 0], [1, 0]], [[0, 1], [0, 1]]]), + ({"probs": [0.1, 0.2, 0.7]}, [[1, 0, 0], [0, 1, 0], [0, 0, 1]]), + ({"probs": [[0.1, 0.9], [0.3, 0.7]]}, [[[1, 0]], [[0, 1]]]), ] self._check_enumerate_support(OneHotCategorical, examples) 
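The reworked _check_enumerate_support expectations above encode the new enumerate_support(expand=...) keyword: with expand=False the support is enumerated along a leading dimension while batch dimensions stay singleton, and with expand=True each support value is broadcast over the distribution's batch shape. A minimal sketch of those semantics, assuming only the torch.distributions API exercised by these tests:

    import torch
    from torch.distributions import Bernoulli

    d = Bernoulli(probs=torch.tensor([[0.1, 0.2], [0.3, 0.4]]))  # batch_shape == (2, 2)

    # Compact form: one slice per support value {0, 1}, batch dims kept at size 1.
    compact = d.enumerate_support(expand=False)   # shape (2, 1, 1)

    # Expanded form: each support value broadcast over the full batch shape.
    expanded = d.enumerate_support(expand=True)   # shape (2, 2, 2)

    # Same relation that _check_enumerate_support asserts above.
    assert torch.equal(expanded, compact.expand((-1,) + d.batch_shape + d.event_shape))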
diff --git a/test/test_jit.py b/test/test_jit.py index 93c9027123bd4..5eb7d4649bd76 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1583,6 +1583,97 @@ def foo(x): test_x = torch.rand(6, 3) self.assertEqual(foo(test_x), traced(test_x)) + def test_export_expand_aten_fallback(self): + class ExpandTest(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + y = x + for i in range(5): + y = x.expand([3, 4, i]) + return y + + mod = ExpandTest() + example_outs = mod(torch.rand(3, 4, 1)) + f = io.BytesIO() + with self.assertRaisesRegex(RuntimeError, 'Could not export a broadcasted operation'): + torch.onnx.export_to_pretty_string(mod, (torch.rand(3, 4, 1),), f, verbose=False, + example_outputs=example_outs) + + self.assertExpected( + torch.onnx.export_to_pretty_string(mod, (torch.rand(3, 4, 1),), f, verbose=False, + example_outputs=example_outs, + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK)) + + def test_export_dropout(self): + test = torch.nn.Dropout() + test.eval() + + traced = torch.jit.trace(test, (torch.rand(3, 4),), check_trace=False) + imported = self.getExportImportCopy(traced) + x = torch.randn(3, 4) + self.assertEqual(traced(x), imported(x)) + + def test_export_batchnorm(self): + for mode in ['eval', 'train']: + for clazz in [ + torch.nn.BatchNorm1d(100), + torch.nn.BatchNorm1d(100, affine=False), + torch.nn.BatchNorm2d(100), + torch.nn.BatchNorm2d(100, affine=False)]: + getattr(clazz, mode)() + + input = torch.randn(20, 100) if isinstance(clazz, torch.nn.BatchNorm1d) else \ + torch.randn(20, 100, 35, 45) + + traced = torch.jit.trace(clazz, (input,)) + imported = self.getExportImportCopy(traced) + x = torch.randn(20, 100) if isinstance(clazz, torch.nn.BatchNorm1d) else \ + torch.randn(20, 100, 35, 45) + self.assertEqual(traced(x), imported(x)) + + def test_export_rnn(self): + for clazz in [nn.RNN(10, 20, 2), nn.GRU(10, 20, 2)]: + class RNNTest(torch.nn.Module): + def __init__(self): + super(RNNTest, self).__init__() + self.rnn = clazz + + def forward(self, x, lengths, h0): + packed = torch.nn.utils.rnn.pack_padded_sequence(x, lengths) + out, h = self.rnn(packed, h0) + padded_outs, _ = torch.nn.utils.rnn.pad_packed_sequence(out) + return padded_outs + + test = RNNTest() + + traced = torch.jit.trace(test, (torch.randn(5, 3, 10), torch.LongTensor([3, 2, 1]), torch.randn(2, 3, 20))) + imported = self.getExportImportCopy(traced) + x, lengths, h0 = torch.randn(5, 3, 10), torch.LongTensor([3, 3, 2]), torch.randn(2, 3, 20) + self.assertEqual(traced(x, lengths, h0), imported(x, lengths, h0)) + + def test_export_lstm(self): + class LSTMTest(torch.nn.Module): + def __init__(self): + super(LSTMTest, self).__init__() + self.rnn = nn.LSTM(10, 20, 2) + + def forward(self, x, lengths, hiddens): + h0, c0 = hiddens + packed = torch.nn.utils.rnn.pack_padded_sequence(x, lengths) + out, (h, c) = self.rnn(packed, (h0, c0)) + padded_outs, _ = torch.nn.utils.rnn.pad_packed_sequence(out) + return padded_outs + + test = LSTMTest() + + traced = torch.jit.trace(test, (torch.randn(5, 3, 10), + torch.LongTensor([3, 2, 1]), + (torch.randn(2, 3, 20), torch.randn(2, 3, 20)))) + imported = self.getExportImportCopy(traced) + x, lengths, h0, c0 = \ + torch.randn(5, 3, 10), torch.LongTensor([3, 3, 2]), torch.randn(2, 3, 20), torch.randn(2, 3, 20) + self.assertEqual(traced(x, lengths, (h0, c0)), imported(x, lengths, (h0, c0))) + class TestBatched(TestCase): # generate random examples and create an batchtensor with them @@ -3038,6 +3129,7 @@ def test_fuser_multiple_blocks(this, 
that, theother, meme): self.assertEqual(cu.test_fuser_multiple_blocks(*inputs), outputs) @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows") + @unittest.skip("this test is flaky, see #11360") def test_scalar_fusion(self): def fn(x, y): return x + y.type_as(x) @@ -6953,16 +7045,6 @@ def forward(self, x, y): EXCLUDE_TRACED = { 'test_split_dim', 'test_split_dim_neg0', - - # nn functional test - # schema not found for onnx node - 'test_nn_instance_norm', - - # output no dependence with traced input, tracer confusion - 'test_nn_rrelu', - - # aten op has additional cudnn argument - 'test_nn_group_norm', } # known to be failing in script @@ -6991,9 +7073,6 @@ def forward(self, x, y): 'test_norm_inf', 'test_renorm_norm_inf', 'test_split', - 'test_split_size_list', - 'test_split_size_list_dim', - 'test_split_size_list_dim_neg0', 'test_expand', 'test_expand_1_element', 'test_expand_new_dim', @@ -7006,18 +7085,17 @@ def forward(self, x, y): 'test_repeat_scalar', 'test_repeat_single_number', 'test_repeat_unsqueeze', - 'test_reshape', 'test_reshape_1d', 'test_reshape_scalar_to_1d', - 'test_reshape_size', 'test_view', 'test_view_1d', 'test_view_scalar_to_1d', - 'test_view_size', 'test_split_dim', 'test_split_dim_neg0', 'test_gesv', 'test_inverse', + 'test_matrix_power_n=-1', # involves inverse + 'test_matrix_power_n=-3', # involves inverse # skipped nn functional tests # ops involves sampling which could not test 'test_nn_dropout', @@ -7029,13 +7107,16 @@ def forward(self, x, y): 'test_nn_adaptive_max_pool1d', 'test_nn_adaptive_max_pool2d', 'test_nn_adaptive_max_pool3d', + 'test_nn_ctc_loss', # argument has custom behavior 'test_nn_fractional_max_pool2d', + 'test_nn_max_unpool3d', 'test_nn_embedding', 'test_nn_embedding_bag', 'test_nn_batch_norm', + # aten op has additional cudnn argument 'test_nn_group_norm', 'test_nn_nll_loss', @@ -7065,6 +7146,8 @@ def forward(self, x, y): 'test_nn_lp_pool1d', 'test_nn_lp_pool2d', 'test_nn_instance_norm', + 'test_nn_grid_sample', + 'test_nn_gumbel_softmax', } @@ -7348,6 +7431,7 @@ def func(x): ('max_pool3d', (S, S, S, S, S), (2, 1)), ('max_unpool1d', torch.tensor([[[2., 4]]]), (torch.tensor([[[1, 3]]]), 2, 2, 0)), ('max_unpool2d', torch.tensor([[[[2., 4]]]]), (torch.tensor([[[[1, 3]]]]), 2, 2, 0)), + ('max_unpool3d', torch.tensor([[[[[2., 4]]]]]), (torch.tensor([[[[[1, 3]]]]]), 2, 2, 0)), ('lp_pool1d', (S, S, S), (2, 3, 2,)), ('lp_pool2d', (S, S, S, S), (2, 3, 2,)), ('adaptive_max_pool1d', (S, S, S), (5,)), @@ -7410,24 +7494,27 @@ def func(x): ('normalize', (S, S, S), (),), ('unfold', (S, S, S, S), ([2, 3]),), ('fold', (1, 3 * 2 * 2, 12), ([4, 5], [2, 2]),), - - # distribution sampling make result different in every run for below ops: - # ('gumbel_softmax', (S, S), (2,),), - # - # No high order gradient for the below ops - # ('multilabel_margin_loss', torch.tensor([[0.2, -0.2, 0.07]]), (torch.tensor([[0, 0, 1]]),),), - # ('max_unpool3d', torch.tensor([[[[[2., 4]]]]]), (torch.tensor([[[[[1, 3]]]]]), 2, 2, 0)), - # ('grid_sample', (S, S, S, S), (non_differentiable(torch.rand(S, S, S, 2)),),), - # ('multi_margin_loss', (S, S), (non_differentiable(torch.randint(S, (S, ), dtype=torch.int64)),),), - # ('binary_cross_entropy', torch.randn(3, 2).sigmoid(), (non_differentiable(torch.rand(3, 2)),),), - # ('ctc_loss', torch.randn(S, S, S).log_softmax(2).detach().requires_grad_(), (torch.randint(1, S + 1, (S, S), - # dtype=torch.long), torch.full((S,), S, dtype=torch.long), torch.randint(1,S,(S,), dtype=torch.long))), + ('grid_sample', (S, S, S, S), 
(non_differentiable(torch.rand(S, S, S, 2)),),), + ('gumbel_softmax', (S, S), (2,),), + ('multilabel_margin_loss', torch.tensor([[0.2, -0.2, 0.07]]), (torch.tensor([[0, 0, 1]]),),), + ('multi_margin_loss', (S, S), (non_differentiable(torch.randint(S, (S, ), dtype=torch.int64)), \ + 1, 1, non_differentiable(torch.randn(S))),), + ('binary_cross_entropy', torch.randn(3, 2).sigmoid(), (non_differentiable(torch.rand(3, 2)), \ + non_differentiable(torch.randn(3, 2))),), + ('ctc_loss', torch.randn(S, S, S).log_softmax(2).detach().requires_grad_(), \ + (torch.randint(1, S + 1, (S, S), dtype=torch.long), torch.full((S,), S, dtype=torch.long), \ + torch.randint(1, S, (S,), dtype=torch.long))), ] # Test names in this set are only checked for a single derivative nn_functional_single_grad = frozenset('test_nn_' + name for name in [ 'pdist', + 'multilabel_margin_loss', + 'max_unpool3d', + 'multi_margin_loss', + 'binary_cross_entropy', + 'ctc_loss', ]) diff --git a/test/test_nn.py b/test/test_nn.py index f0a8f86017d96..b2597b894803f 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -4895,34 +4895,44 @@ def test_pairwise_distance(self): input2 = torch.randn(4, 4, requires_grad=True) self.assertTrue(gradcheck(lambda x, y: F.pairwise_distance(x, y), (input1, input2))) + @skipIfRocm def test_pdist(self): - for trans in [False, True]: - inp = torch.randn(4, 5, requires_grad=True) + for device, trans in itertools.product(device_(), [False, True]): + inp = torch.randn(4, 5, dtype=torch.double, device=device, requires_grad=True) if trans: inp = inp.transpose(0, 1) for p in [0, 1, 2, 0.5, 1.5, 2.5, float('inf')]: self.assertTrue(gradcheck(lambda x: F.pdist(x, p), (inp,))) + @skipIfRocm def test_pdist_zeros(self): """Test that grad is still valid when dist is 0""" - for trans in [False, True]: - inp = torch.randn(1, 3, requires_grad=True).repeat([2, 1]) + for device in device_(): + inp = torch.randn(1, 3, dtype=torch.double, device=device, requires_grad=True).repeat([2, 1]) for p in [0, 1, 2, 0.5, 1.5, 2.5, float('inf')]: self.assertTrue(gradcheck(lambda x: F.pdist(x, p), (inp,))) + @skipIfRocm def test_pdist_empty_row(self): - inp = torch.randn(1, 3, requires_grad=True) - self.assertTrue(gradcheck(F.pdist, (inp,))) + for device in device_(): + inp = torch.randn(1, 3, dtype=torch.double, device=device, requires_grad=True) + self.assertTrue(gradcheck(F.pdist, (inp,))) def test_pdist_empty_col(self): - inp = torch.randn(4, 0, requires_grad=True) - self.assertTrue(gradcheck(F.pdist, (inp,))) + for device in device_(): + inp = torch.randn(4, 0, dtype=torch.double, device=device, requires_grad=True) + self.assertTrue(gradcheck(F.pdist, (inp,))) @unittest.expectedFailure - def test_pdist_gradgrad_unimplemented(self): + def test_pdist_cpu_gradgrad_unimplemented(self): inp = torch.randn(4, 5, requires_grad=True) gradgradcheck(F.pdist, (inp,)) + @unittest.expectedFailure + def test_pdist_cuda_gradgrad_unimplemented(self): + inp = torch.randn(4, 5, device='cuda', requires_grad=True) + gradgradcheck(F.pdist, (inp,)) + def test_cosine_embedding_loss_no_reduce(self): input1 = torch.randn(15, 10, requires_grad=True) input2 = torch.randn(15, 10, requires_grad=True) diff --git a/test/test_sparse.py b/test/test_sparse.py index fab6ad978176b..0b3ff39ab4830 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -736,7 +736,7 @@ def _test_sparse_mask_fixed(self): [17, 18, 19, 20], ]) exp_v = self.ValueTensor([7, 14, 3, 20]) - res = dense._sparse_mask(x) + res = dense.sparse_mask(x) expected = self.SparseTensor(i, exp_v, 
torch.Size([5, 4])) self.assertEqual(res, expected) @@ -839,7 +839,7 @@ def _test_sparse_mask_hybrid_fixed(self): [[13, 5], [14, 1], [15, 1], [16, 6]], [[17, 7], [18, 2], [19, 7], [20, 1]], ]) - res = dense._sparse_mask(x) + res = dense.sparse_mask(x) exp_v = self.ValueTensor([[7, 9], [14, 1], [3, 3], [20, 1]]) expected = self.SparseTensor(i, exp_v, torch.Size([5, 4, 2])) self.assertEqual(res, expected) diff --git a/test/test_torch.py b/test/test_torch.py index 02ffe7f1048f7..ea0d69d7b0e26 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -189,6 +189,7 @@ def test_namespace(ns, *skips): 'is_coalesced', 'is_distributed', 'is_floating_point', + 'is_complex', 'is_nonzero', 'is_same_size', 'is_signed', @@ -216,6 +217,7 @@ def test_namespace(ns, *skips): 'to_dense', 'sparse_resize_', 'sparse_resize_and_clear_', + 'sparse_mask', ) test_namespace(torch.nn) test_namespace(torch.nn.functional, 'assert_int_or_pair', 'bilinear', 'feature_alpha_dropout') @@ -1080,7 +1082,7 @@ def test_pairwise_distance_empty(self): @skipIfRocm def test_pdist_empty(self): - devices = ['cpu'] + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: shape = (0, 2) x = torch.randn(shape, device=device) @@ -1098,7 +1100,7 @@ def test_pdist_empty(self): @unittest.skipIf(not TEST_SCIPY, "Scipy not found") def test_pdist_scipy(self): from scipy.spatial.distance import pdist - devices = ['cpu'] + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: for shape in [(4, 5), (3, 2), (2, 1)]: for p in [0, 1, 2, 3, 1.5, 2.5, float('inf')]: @@ -1109,13 +1111,13 @@ def test_pdist_scipy(self): actual = torch.pdist(x, p=p) # pdist doesn't handle 0 or inf norm properly if p == 0: - expected = pdist(x, 'hamming') * x.shape[1] + expected = pdist(x.cpu(), 'hamming') * x.shape[1] elif p == float('inf'): - expected = pdist(x, lambda a, b: np.abs(a - b).max()) + expected = pdist(x.cpu(), lambda a, b: np.abs(a - b).max()) else: - expected = pdist(x, 'minkowski', p=p) + expected = pdist(x.cpu(), 'minkowski', p=p) self.assertEqual(expected.shape, actual.shape) - self.assertTrue(np.allclose(expected, actual.numpy())) + self.assertTrue(np.allclose(expected, actual.cpu().numpy())) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") def test_logsumexp(self): @@ -2510,6 +2512,24 @@ def test_as_tensor(self): self.assertEqual(torch.tensor(x), torch.as_tensor(x)) self.assertEqual(torch.tensor(x, dtype=torch.float32), torch.as_tensor(x, dtype=torch.float32)) + # python data with heterogeneous types + z = [0, 'torch'] + with self.assertRaisesRegex(TypeError, "invalid data type"): + torch.tensor(z) + torch.as_tensor(z) + + # python data with self-referential lists + z = [0] + z += [z] + with self.assertRaisesRegex(TypeError, "self-referential lists are incompatible"): + torch.tensor(z) + torch.as_tensor(z) + + z = [[1, 2], z] + with self.assertRaisesRegex(TypeError, "self-referential lists are incompatible"): + torch.tensor(z) + torch.as_tensor(z) + # from tensor (doesn't copy unless type is different) y = torch.tensor(x) self.assertIs(y, torch.as_tensor(y)) @@ -2925,6 +2945,20 @@ def test_arange_inference(self): torch.tensor(1, dtype=torch.int16)).dtype) torch.set_default_dtype(saved_dtype) + def test_randint_inference(self): + size = (2, 1) + for args in [(3,), (1, 3)]: # (low,) and (low, high) + self.assertIs(torch.int64, torch.randint(*args, size=size).dtype) + self.assertIs(torch.int64, torch.randint(*args, size=size, layout=torch.strided).dtype) + 
self.assertIs(torch.int64, torch.randint(*args, size=size, generator=torch.default_generator).dtype) + self.assertIs(torch.float32, torch.randint(*args, size=size, dtype=torch.float32).dtype) + out = torch.empty(size, dtype=torch.float32) + self.assertIs(torch.float32, torch.randint(*args, size=size, out=out).dtype) + self.assertIs(torch.float32, torch.randint(*args, size=size, out=out, dtype=torch.float32).dtype) + out = torch.empty(size, dtype=torch.int64) + self.assertIs(torch.int64, torch.randint(*args, size=size, out=out).dtype) + self.assertIs(torch.int64, torch.randint(*args, size=size, out=out, dtype=torch.int64).dtype) + @staticmethod def _select_broadcastable_dims(dims_full=None): # select full dimensionality @@ -4562,6 +4596,49 @@ def run_test(M): def test_pinverse(self): self._test_pinverse(self, conv_fn=lambda x: x) + @staticmethod + def _test_matrix_power(self, conv_fn): + def run_test(M, sign=1): + if sign == -1: + M = M.inverse() + MP2 = torch.matrix_power(M, 2) + self.assertEqual(MP2, torch.matmul(M, M)) + + MP3 = torch.matrix_power(M, 3) + self.assertEqual(MP3, torch.matmul(MP2, M)) + + MP4 = torch.matrix_power(M, 4) + self.assertEqual(MP4, torch.matmul(MP2, MP2)) + + MP6 = torch.matrix_power(M, 6) + self.assertEqual(MP6, torch.matmul(MP3, MP3)) + + MP0 = torch.matrix_power(M, 0) + self.assertEqual(MP0, torch.eye(M.size(-2)).expand_as(M)) + + # Single matrix + M = conv_fn(torch.randn(5, 5)) + run_test(M) + + # Batch matrices + M = conv_fn(torch.randn(3, 3, 3)) + run_test(M) + + # Many batch matrices + M = conv_fn(torch.randn(2, 3, 3, 3)) + run_test(M) + + # Single matrix, but full rank + # This is for negative powers + from test_autograd import random_fullrank_matrix_distinct_singular_value + M = conv_fn(random_fullrank_matrix_distinct_singular_value(5)) + run_test(M) + run_test(M, sign=-1) + + @skipIfNoLapack + def test_matrix_power(self): + self._test_matrix_power(self, conv_fn=lambda x: x) + @staticmethod def _test_det_logdet_slogdet(self, conv_fn): def reference_det(M): diff --git a/third_party/gloo b/third_party/gloo index a9c745637c60e..aa0d2e3f8aa4f 160000 --- a/third_party/gloo +++ b/third_party/gloo @@ -1 +1 @@ -Subproject commit a9c745637c60e99a75c8cc8eeeb90ff19d18b176 +Subproject commit aa0d2e3f8aa4f9cee5ffa46070491cf1ed6aae70 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 67d9634b9babc..8accdb97427b2 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -765,12 +765,12 @@ - name: zero_(Tensor self) self: zeros_like(grad) -- name: _sparse_mask(Tensor self, SparseTensorRef mask) - self: not_implemented("_sparse_mask") - mask: not_implemented("_sparse_mask") +- name: sparse_mask(Tensor self, SparseTensorRef mask) + self: not_implemented("sparse_mask") + mask: not_implemented("sparse_mask") - name: _standard_gamma(Tensor self, Generator generator) - self: grad * self._standard_gamma_grad(result) + self: grad * _standard_gamma_grad(self, result) - name: _standard_gamma_grad(Tensor self, Tensor output) self: not_implemented("_standard_gamma_grad") diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 2b2999335e70d..b3ea70aa87222 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -26,7 +26,7 @@ 'index', '_indexCopy_', 'max_values', 'min_values', 'argmax', 'argmin', '_cumsum.*', '_cumprod.*', '_sum.*', '_prod.*', '_th_.*', - 'arange.*', 'range.*', '_gesv.*', '_getri.*', 'slice', + 'arange.*', 'range.*', 
'_gesv.*', '_getri.*', 'slice', 'randint(_out)?', '_local_scalar', '_local_scalar_dense', 'max_pool1d', 'max_pool2d', 'max_pool3d', 'linear' ] @@ -151,10 +151,10 @@ def should_generate_python_binding(declaration): # TODO: fix handling of SparseTensor. We don't want to generate Python # bindings to SparseTensor overloads, such as add(Tensor, SparseTensorRef), # since the Tensor-based signature already dynamically dispatches correctly. - # However, _sparse_mask only has a SparseTensor signature so we need to bind + # However, sparse_mask only has a SparseTensor signature so we need to bind # that function. for arg in declaration['arguments']: - if arg['type'] == 'SparseTensorRef' and declaration['name'] != '_sparse_mask': + if arg['type'] == 'SparseTensorRef' and declaration['name'] != 'sparse_mask': return False return True @@ -427,7 +427,7 @@ def append_actuals_formals(actual, formal): env['actuals'] = actuals if has_tensor_options: - env['initialize_cuda'] = 'maybe_initialize_cuda(at::getMaybeVariableType(options));' + env['initialize_cuda'] = 'maybe_initialize_cuda(at::getType(options));' else: env['initialize_cuda'] = 'maybe_initialize_cuda({});'.format(type_args[0]['name']) if type_args else '' diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 05affcbaa6b71..d6bcb0821e83c 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -31,7 +31,7 @@ # These functions are written manually in templates/VariableType.cpp MANUAL_IMPLEMENTATIONS = { - 'contiguous', 'resize_', 'resize_as_' + 'contiguous', 'resize_', 'resize_as_', 'detach', 'detach_', } # These functions we don't want to record for tracing, because we always want diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index cc4695f3df504..caf1df4bda722 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -531,7 +531,7 @@ Tensor _fused_dropout_backward(Tensor grad, Tensor mask, double p1m) { // Use autograd-friendly backward if double backward is required return grad * (mask.type_as(grad) * (1. / p1m)); } else { - return grad._masked_scale(mask, 1. / p1m); + return at::_masked_scale(grad, mask, 1. 
/ p1m); } } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index c2d7cc39bdeae..d4a9a4eccab86 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -18,7 +18,7 @@ #include "torch/csrc/utils/variadic.h" #include "torch/csrc/autograd/functions/utils.h" -#include +#include #include #include @@ -127,7 +127,7 @@ struct VariableTypeRegistry { struct VariableHooks : public at::VariableHooksInterface { VariableHooks(at::VariableHooksArgs) {} - void registerVariableTypeFor(at::Context*, at::Backend, at::ScalarType) const override; + void registerVariableTypeFor(at::LegacyTypeDispatch*, at::Backend, at::ScalarType) const override; at::Type& getVariableTypeFromBaseType(const at::Type&) const override; }; @@ -161,7 +161,7 @@ static VariableTypeRegistry registry; REGISTER_VARIABLE_HOOKS(VariableHooks) // Pre-condition: backend/scalar_type is a valid type in the type_registry -void VariableHooks::registerVariableTypeFor(at::Context* context, at::Backend backend, at::ScalarType scalar_type) const { +void VariableHooks::registerVariableTypeFor(at::LegacyTypeDispatch* context, at::Backend backend, at::ScalarType scalar_type) const { auto* baseType = context->getNonVariableTypeRaw(backend, scalar_type); register_variable_type_for(baseType); } @@ -378,6 +378,13 @@ static bool isFloatingPoint(ScalarType s) { return s == kFloat || s == kDouble || s == kHalf; } +void VariableType::backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const { + as_variable_ref(self).backward(gradient, keep_graph, create_graph); +} + +void VariableType::set_data(Tensor & self, Tensor new_data) const { + as_variable_ref(self).set_data(new_data); +} Tensor & VariableType::s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const { jit::Node* node = nullptr; if(torch::jit::tracer::isTracing()) { @@ -446,6 +453,46 @@ Tensor VariableType::contiguous(const Tensor & self) const { return self.clone(); } +Tensor VariableType::detach(const Tensor & self) const { + profiler::RecordFunction profiler("detach"); + torch::jit::Node* node = nullptr; + if (jit::tracer::isTracing()) { + auto& graph = jit::tracer::getTracingState()->graph; + node = graph->create(jit::aten::detach, /*outputs=*/0); + jit::tracer::recordSourceLocation(node); + jit::tracer::addInputs(node, "self", self); + graph->appendNode(node); + + } + // + auto result = as_variable_ref(const_cast(self)).detach(); + // + if (jit::tracer::isTracing()) { + jit::tracer::addOutput(node, result); + } + return result; +} + +Tensor & VariableType::detach_(Tensor & self) const { + profiler::RecordFunction profiler("detach_"); + torch::jit::Node* node = nullptr; + if (jit::tracer::isTracing()) { + auto& graph = jit::tracer::getTracingState()->graph; + node = graph->create(jit::aten::detach, /*outputs=*/0); + jit::tracer::recordSourceLocation(node); + jit::tracer::addInputs(node, "self", self); + graph->appendNode(node); + jit::tracer::ensureUnique("detach_", self); + } + // + as_variable_ref(self).detach_(); + // + if (jit::tracer::isTracing()) { + jit::tracer::addOutput(node, self); + } + return self; +} + static std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index b307a1459da1d..fe3e57f4fc024 100644 --- a/tools/autograd/templates/VariableType.h +++ 
b/tools/autograd/templates/VariableType.h @@ -58,6 +58,10 @@ struct TORCH_API VariableType final : public at::TypeDefault { Tensor & s_copy_(Tensor & self, const Tensor & src, bool non_blocking) const override; Tensor & _s_copy_from(const Tensor & self, Tensor & dst, bool non_blocking) const override; + + void backward(Tensor & self, at::optional gradient, bool keep_graph, bool create_graph) const override; + void set_data(Tensor & self, Tensor new_data) const override; + ${type_derived_method_declarations} private: diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index f94f3926445d3..144771290d1c8 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -53,9 +53,8 @@ static void check_out_type_matches(Tensor result, const auto& type = torch::getVariableType(scalarType_arg, layout_arg, device_type_arg); if (result.type() != type) { AT_ERROR( - "type corresponding to %s does not match type of out parameter (%s)", - type.toString(), - result.type().toString()); + "type corresponding to ", type.toString(), + " does not match type of out parameter (", result.type().toString(), ")"); } } @@ -65,7 +64,7 @@ inline Tensor dispatch_arange(Scalar end, Tensor result) { } inline Tensor dispatch_arange(Scalar end, const TensorOptions& options) { - maybe_initialize_cuda(at::getMaybeVariableType(options)); + maybe_initialize_cuda(at::getType(options)); AutoNoGIL no_gil; return torch::arange(end, options); } @@ -76,7 +75,7 @@ inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, Tensor resu } inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, const TensorOptions& options) { - maybe_initialize_cuda(at::getMaybeVariableType(options)); + maybe_initialize_cuda(at::getType(options)); AutoNoGIL no_gil; return torch::arange(start, end, step, options); } @@ -147,7 +146,7 @@ inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, Tensor resul } inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, const TensorOptions& options) { - maybe_initialize_cuda(at::getMaybeVariableType(options)); + maybe_initialize_cuda(at::getType(options)); AutoNoGIL no_gil; DeviceGuard device_guard(options.device()); return torch::range(start, end, step, options); @@ -184,6 +183,140 @@ static PyObject * THPVariable_range(PyObject* self, PyObject* args, PyObject* kw END_HANDLE_TH_ERRORS } +inline Tensor dispatch_randint(int64_t high, IntList size, Generator * generator, Tensor result) { + AutoNoGIL no_gil; + return at::randint_out(result, high, size, generator); +} +inline Tensor dispatch_randint(int64_t high, IntList size, Generator * generator, const TensorOptions & options) { + maybe_initialize_cuda(at::getType(options)); + AutoNoGIL no_gil; + return torch::randint(high, size, generator, options); +} +inline Tensor dispatch_randint(int64_t high, IntList size, Tensor result) { + AutoNoGIL no_gil; + return at::randint_out(result, high, size); +} +inline Tensor dispatch_randint(int64_t high, IntList size, const TensorOptions & options) { + maybe_initialize_cuda(at::getType(options)); + AutoNoGIL no_gil; + return torch::randint(high, size, options); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Generator * generator, Tensor result) { + AutoNoGIL no_gil; + return at::randint_out(result, low, high, size, generator); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Generator * generator, 
const TensorOptions & options) { + maybe_initialize_cuda(at::getType(options)); + AutoNoGIL no_gil; + return torch::randint(low, high, size, generator, options); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, Tensor result) { + AutoNoGIL no_gil; + return at::randint_out(result, low, high, size); +} +inline Tensor dispatch_randint(int64_t low, int64_t high, IntList size, const TensorOptions & options) { + maybe_initialize_cuda(at::getType(options)); + AutoNoGIL no_gil; + return torch::randint(low, high, size, options); +} + +static PyObject * THPVariable_randint(PyObject* self_, PyObject* args, PyObject* kwargs) +{ + HANDLE_TH_ERRORS + static PythonArgParser parser({ + "randint(int64_t high, IntList size, *, Generator generator, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + "randint(int64_t high, IntList size, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + "randint(int64_t low, int64_t high, IntList size, *, Generator generator, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + "randint(int64_t low, int64_t high, IntList size, *, Tensor out=None, ScalarType dtype=None, Layout layout=torch.strided, Device device=None, bool requires_grad=False)", + }, /*traceable=*/false); + + ParsedArgs<9> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + if (r.idx == 0) { + if (r.isNone(3)) { + auto high = r.toInt64(0); + auto size = r.intlist(1); + auto generator = r.generator(2); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + auto dtype = r.scalartypeWithDefault(4, at::ScalarType::Long); + auto device = r.device(6); + const auto options = TensorOptions() + .dtype(dtype) + .device(device) + .layout(r.layout(5).layout) + .requires_grad(r.toBool(7)); + return wrap(dispatch_randint(high, size, generator, options)); + } else { + check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), + r.layout(5), r.isNone(5), + r.device(6), r.isNone(6)); + return wrap(dispatch_randint(r.toInt64(0), r.intlist(1), r.generator(2), r.tensor(3)).set_requires_grad(r.toBool(7))); + } + } else if (r.idx == 1) { + if (r.isNone(2)) { + auto high = r.toInt64(0); + auto size = r.intlist(1); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + auto dtype = r.scalartypeWithDefault(3, at::ScalarType::Long); + auto device = r.device(5); + const auto options = TensorOptions() + .dtype(dtype) + .device(device) + .layout(r.layout(4).layout) + .requires_grad(r.toBool(6)); + return wrap(dispatch_randint(high, size, options)); + } else { + check_out_type_matches(r.tensor(2), r.scalartype(3), r.isNone(3), + r.layout(4), r.isNone(4), + r.device(5), r.isNone(5)); + return wrap(dispatch_randint(r.toInt64(0), r.intlist(1), r.tensor(2)).set_requires_grad(r.toBool(6))); + } + } else if (r.idx == 2) { + if (r.isNone(4)) { + auto low = r.toInt64(0); + auto high = r.toInt64(1); + auto size = r.intlist(2); + auto generator = r.generator(3); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + auto dtype = r.scalartypeWithDefault(5, at::ScalarType::Long); + auto device = r.device(7); + const auto options = TensorOptions() + .dtype(dtype) + .device(device) + .layout(r.layout(6).layout) + .requires_grad(r.toBool(8)); + return wrap(dispatch_randint(low, high, size, generator, options)); + } else { + check_out_type_matches(r.tensor(4), 
r.scalartype(5), r.isNone(5), + r.layout(6), r.isNone(6), + r.device(7), r.isNone(7)); + return wrap(dispatch_randint(r.toInt64(0), r.toInt64(1), r.intlist(2), r.generator(3), r.tensor(4)).set_requires_grad(r.toBool(8))); + } + } else if (r.idx == 3) { + if (r.isNone(3)) { + auto low = r.toInt64(0); + auto high = r.toInt64(1); + auto size = r.intlist(2); + // NOTE: r.scalartype(X) gives the default dtype if r.isNone(X) + auto dtype = r.scalartypeWithDefault(4, at::ScalarType::Long); + auto device = r.device(6); + const auto options = TensorOptions() + .dtype(dtype) + .device(device) + .layout(r.layout(5).layout) + .requires_grad(r.toBool(7)); + return wrap(dispatch_randint(low, high, size, options)); + } else { + check_out_type_matches(r.tensor(3), r.scalartype(4), r.isNone(4), + r.layout(5), r.isNone(5), + r.device(6), r.isNone(6)); + return wrap(dispatch_randint(r.toInt64(0), r.toInt64(1), r.intlist(2), r.tensor(3)).set_requires_grad(r.toBool(7))); + } + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static PyObject * THPVariable_as_tensor(PyObject* self, PyObject* args, PyObject* kwargs) { HANDLE_TH_ERRORS @@ -240,6 +373,7 @@ static PyMethodDef torch_functions[] = { {"from_numpy", (PyCFunction)THPVariable_from_numpy, METH_STATIC | METH_O, NULL}, {"hsmm", (PyCFunction)THPVariable_hspmm, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, {"_promote_types", (PyCFunction)THPVariable__promote_types, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, + {"randint", (PyCFunction)THPVariable_randint, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, {"range", (PyCFunction)THPVariable_range, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, {"saddmm", (PyCFunction)THPVariable_sspaddmm, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, {"sparse_coo_tensor", (PyCFunction)THPVariable_sparse_coo_tensor, METH_VARARGS | METH_KEYWORDS | METH_STATIC, NULL}, diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index ea8869ed737aa..a863a18e04bdf 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -159,6 +159,15 @@ static double dispatch_to_CDouble(const Tensor & self) { return self.toCDouble(); } +static std::complex dispatch_to_CComplexDouble(const Tensor & self) { + AutoNoGIL no_gil; + DeviceGuard device_guard(self); + if (self.numel() != 1) { + throw ValueError("only one element tensors can be converted to Python scalars"); + } + return self.toCComplexDouble(); +} + static int64_t dispatch_to_CLong(const Tensor & self) { AutoNoGIL no_gil; DeviceGuard device_guard(self); @@ -365,6 +374,8 @@ static PyObject * THPVariable_item(PyObject* self, PyObject* args) auto& self_ = reinterpret_cast(self)->cdata; if (self_.is_floating_point()) { return wrap(dispatch_to_CDouble(self_)); + } else if (self_.is_complex()) { + return wrap(dispatch_to_CComplexDouble(self_)); } else { return wrap(dispatch_to_CLong(self_)); } diff --git a/tools/build_pytorch_libs.bat b/tools/build_pytorch_libs.bat index b122df45051e3..80d79dc436426 100755 --- a/tools/build_pytorch_libs.bat +++ b/tools/build_pytorch_libs.bat @@ -63,8 +63,9 @@ IF "%REL_WITH_DEB_INFO%"=="1" ( set BUILD_TYPE=RelWithDebInfo ) +:: sccache will fail if all cores are used for compiling IF NOT DEFINED MAX_JOBS ( - set MAX_JOBS=%NUMBER_OF_PROCESSORS% + set /a MAX_JOBS=%NUMBER_OF_PROCESSORS% - 1 ) IF NOT DEFINED BUILD_SHARED_LIBS ( @@ -151,6 +152,8 @@ goto:eof -DTHNN_SO_VERSION=1 ^ -DTHCUNN_SO_VERSION=1 ^ 
-DUSE_CUDA=%USE_CUDA% ^ + -DBUILD_EXAMPLES=OFF ^ + -DBUILD_TEST=%BUILD_TEST% ^ -DNO_NNPACK=%NO_NNPACK% ^ -DCMAKE_BUILD_TYPE=%BUILD_TYPE% @@ -174,11 +177,14 @@ goto:eof -DNO_API=ON ^ -DBUILD_SHARED_LIBS="%BUILD_SHARED_LIBS%" ^ -DBUILD_PYTHON=OFF ^ - -DBUILD_BINARY=OFF ^ + -DBUILD_BINARY=%BUILD_BINARY% ^ + -DBUILD_TEST=OFF ^ + -DINSTALL_TEST=%INSTALL_TEST% ^ -DONNX_NAMESPACE=%ONNX_NAMESPACE% ^ -DUSE_CUDA=%USE_CUDA% ^ -DUSE_CUDNN=OFF ^ -DUSE_NNPACK=%USE_NNPACK% ^ + -DUSE_OPENCV=%USE_OPENCV% ^ -DUSE_GLOG=OFF ^ -DUSE_GFLAGS=OFF ^ -DUSE_SYSTEM_EIGEN_INSTALL=OFF ^ diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index a8b0b9381d67c..ba0bde0d57734 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -16,7 +16,6 @@ USE_ROCM=0 USE_NNPACK=0 USE_MKLDNN=0 USE_GLOO_IBVERBS=0 -FULL_CAFFE2=0 CAFFE2_STATIC_LINK_CUDA=0 while [[ $# -gt 0 ]]; do case "$1" in @@ -35,9 +34,6 @@ while [[ $# -gt 0 ]]; do --use-gloo-ibverbs) USE_GLOO_IBVERBS=1 ;; - --full-caffe2) - FULL_CAFFE2=1 - ;; --cuda-static-link) CAFFE2_STATIC_LINK_CUDA=1 ;; @@ -104,11 +100,11 @@ else fi fi CPP_FLAGS=" -std=c++11 " -GLOO_FLAGS="" +GLOO_FLAGS="-DBUILD_TEST=OFF " THD_FLAGS="" NCCL_ROOT_DIR=${NCCL_ROOT_DIR:-$INSTALL_DIR} if [[ $USE_CUDA -eq 1 ]]; then - GLOO_FLAGS="-DUSE_CUDA=1 -DNCCL_ROOT_DIR=$NCCL_ROOT_DIR" + GLOO_FLAGS+="-DUSE_CUDA=1 -DNCCL_ROOT_DIR=$NCCL_ROOT_DIR" fi # Gloo infiniband support if [[ $USE_GLOO_IBVERBS -eq 1 ]]; then @@ -183,6 +179,8 @@ function build() { -DTHCUNN_SO_VERSION=1 \ -DTHD_SO_VERSION=1 \ -DUSE_CUDA=$USE_CUDA \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_TEST=$BUILD_TEST \ -DNO_NNPACK=$((1-$USE_NNPACK)) \ -DNCCL_EXTERNAL=1 \ -DCMAKE_DEBUG_POSTFIX="" \ @@ -260,16 +258,17 @@ function build_caffe2() { -DBUILDING_WITH_TORCH_LIBS=ON \ -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_TORCH=$BUILD_TORCH \ - -DBUILD_PYTHON=ON \ + -DBUILD_PYTHON=$BUILD_PYTHON \ -DBUILD_SHARED_LIBS=$BUILD_SHARED_LIBS \ - -DBUILD_BINARY=$FULL_CAFFE2 \ - -DBUILD_TEST=$FULL_CAFFE2 \ - -DINSTALL_TEST=$FULL_CAFFE2 \ + -DBUILD_BINARY=$BUILD_BINARY \ + -DBUILD_TEST=$BUILD_TEST \ + -DINSTALL_TEST=$INSTALL_TEST \ -DONNX_NAMESPACE=$ONNX_NAMESPACE \ -DUSE_CUDA=$USE_CUDA \ -DCAFFE2_STATIC_LINK_CUDA=$CAFFE2_STATIC_LINK_CUDA \ -DUSE_ROCM=$USE_ROCM \ -DUSE_NNPACK=$USE_NNPACK \ + -DUSE_OPENCV=$USE_OPENCV \ -DUSE_GLOG=OFF \ -DUSE_GFLAGS=OFF \ -DUSE_SYSTEM_EIGEN_INSTALL=OFF \ diff --git a/tools/setup_helpers/build.py b/tools/setup_helpers/build.py new file mode 100644 index 0000000000000..82364bd7394c5 --- /dev/null +++ b/tools/setup_helpers/build.py @@ -0,0 +1,5 @@ +from .env import check_env_flag, check_negative_env_flag + +BUILD_BINARY = check_env_flag('BUILD_BINARY') +BUILD_TEST = not check_negative_env_flag('BUILD_TEST') +USE_OPENCV = check_env_flag('USE_OPENCV') diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 1fc87e10be15a..d5d0ebc663915 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -176,7 +176,7 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/passes/shape_analysis.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/specialize_undef.cpp ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp - ${TORCH_SRC_DIR}/csrc/jit/register_prim_ops.cpp + ${TORCH_SRC_DIR}/csrc/jit/register_special_ops.cpp ${TORCH_SRC_DIR}/csrc/jit/register_symbols.cpp ${TORCH_SRC_DIR}/csrc/jit/script/compiler.cpp ${TORCH_SRC_DIR}/csrc/jit/script/lexer.cpp diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index ff4c070275353..f7ace5e0edcac 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1317,6 +1317,13 @@ def callable(a, 
b) -> number See :func:`torch.masked_select` """) +add_docstr_all('matrix_power', + r""" +matrix_power(n) -> Tensor + +See :func:`torch.matrix_power` +""") + add_docstr_all('max', r""" max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index b803428850ebd..ea6016f778f71 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2385,6 +2385,38 @@ def parse_kwargs(desc): tensor(9) """) +add_docstr(torch.matrix_power, + r""" +matrix_power(input, n) -> Tensor + +Returns the matrix raised to the power :attr:`n` for square matrices. +For batch of matrices, each individual matrix is raised to the power :attr:`n`. + +If :attr:`n` is negative, then the inverse of the matrix (if invertible) is +raised to the power :attr:`n`. If :attr:`n` is 0, then an identity matrix +is returned. + +Args: + input (Tensor): the input tensor + n (int): the power to raise the matrix to + +Example:: + + >>> a = torch.randn(2, 2, 2) + >>> a + tensor([[[-1.9975, -1.9610], + [ 0.9592, -2.3364]], + + [[-1.2534, -1.3429], + [ 0.4153, -1.4664]]]) + >>> torch.matrix_power(a, 3) + tensor([[[ 3.9392, -23.9916], + [ 11.7357, -0.2070]], + + [[ 0.2468, -6.7168], + [ 2.0774, -0.8187]]]) +""") + add_docstr(torch.max, r""" .. function:: max(input) -> Tensor @@ -4409,6 +4441,10 @@ def parse_kwargs(desc): If :attr:`some` is ``True`` (default), the returned `U` and `V` matrices will contain only :math:`min(n, m)` orthonormal columns. +.. note:: The implementation of SVD on CPU uses the LAPACK routine `?gesdd` (a divide-and-conquer + algorithm) instead of `?gesvd` for speed. Analogously, the SVD on GPU uses the MAGMA routine + `gesdd` as well. + .. note:: Irrespective of the original strides, the returned matrix `U` will be transposed, i.e. with strides `(1, n)` instead of `(n, 1)`. diff --git a/torch/_utils.py b/torch/_utils.py index 60ca39000d5d4..4d43dee451386 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -33,9 +33,9 @@ def _type(self, dtype=None, non_blocking=False, **kwargs): raise RuntimeError("Cannot cast sparse tensor to dense tensor") new_module_name = dtype.__module__.replace('.sparse', '') new_values_type_name = new_module_name + '.' + dtype.__name__ - new_values = self._values().type(new_values_type_name, non_blocking) + new_values = torch._values(self).type(new_values_type_name, non_blocking) new_indices_type_name = new_module_name + '.LongTensor' - new_indices = self._indices().type(new_indices_type_name, non_blocking) + new_indices = torch._indices(self).type(new_indices_type_name, non_blocking) return dtype(new_indices, new_values, self.size()) if dtype.is_sparse: raise RuntimeError("Cannot cast dense tensor to sparse tensor") @@ -68,8 +68,8 @@ def _cuda(self, device=None, non_blocking=False, **kwargs): with torch.cuda.device(device): if self.is_sparse: new_type = getattr(torch.cuda.sparse, self.__class__.__name__) - indices = self._indices().cuda(device, non_blocking) - values = self._values().cuda(device, non_blocking) + indices = torch._indices(self).cuda(device, non_blocking) + values = torch._values(self).cuda(device, non_blocking) return new_type(indices, values, self.size()) else: new_type = getattr(torch.cuda, self.__class__.__name__) @@ -163,8 +163,8 @@ def _flatten_sparse_tensors(tensors): A tuple of two contiguous 1D buffers, one containing input tensors' indices and the other containing the values. 
""" - flat_indices = _flatten_dense_tensors([t._indices() for t in tensors]) - flat_values = _flatten_dense_tensors([t._values() for t in tensors]) + flat_indices = _flatten_dense_tensors([torch._indices(t) for t in tensors]) + flat_values = _flatten_dense_tensors([torch._values(t) for t in tensors]) return flat_indices, flat_values @@ -206,8 +206,8 @@ def _unflatten_sparse_tensors(flat, tensors): flat. """ flat_indices, flat_values = flat - indices = _unflatten_dense_tensors(flat_indices, [t._indices() for t in tensors]) - values = _unflatten_dense_tensors(flat_values, [t._values() for t in tensors]) + indices = _unflatten_dense_tensors(flat_indices, [torch._indices(t) for t in tensors]) + values = _unflatten_dense_tensors(flat_values, [torch._values(t) for t in tensors]) outputs = [] for t, i, v in zip(tensors, indices, values): outputs.append(t.new(i, v, t.size())) @@ -252,8 +252,8 @@ def _take_tensors(tensors, size_limit): for tensor in tensors: t = tensor.type() if tensor.is_sparse: - indices = tensor._indices() - values = tensor._values() + indices = torch._indices(tensor) + values = torch._values(tensor) size = indices.numel() * indices.element_size() + values.numel() * values.element_size() else: size = tensor.numel() * tensor.element_size() diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 8d7542fdbc2f2..1c9910d06253e 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -26,7 +26,7 @@ def find_cudnn_windows_lib(): - proc = Popen(['where', 'cudnn64*.dll'], stdout=PIPE, stderr=PIPE) + proc = Popen(['where', 'cudnn64*.dll'], stdout=PIPE, stderr=PIPE, stdin=PIPE) out, err = proc.communicate() out = out.decode().strip() if len(out) > 0: diff --git a/torch/csrc/api/include/torch/jit.h b/torch/csrc/api/include/torch/jit.h index dfcf26dcda2ae..5a4b262101ab6 100644 --- a/torch/csrc/api/include/torch/jit.h +++ b/torch/csrc/api/include/torch/jit.h @@ -9,10 +9,15 @@ namespace torch { namespace jit { -/// Compiles Python JIT code into a graph that can be executed. +/// Compiles script code into an executable graph. +/// +/// Takes a string containing functions in script syntax and compiles them into +/// a module (graph). The returned module provides a `run_method` function +/// that may be used to invoke the compiled functions. /// /// For example: -/// @code +/// \rst +/// .. code-block:: /// auto module = torch::jit::compile(R"JIT( /// def relu_script(a, b): /// return torch.relu(a + b) @@ -23,11 +28,7 @@ namespace jit { /// return a /// )JIT"); /// IValue output = module->run_method("relu_script", a, b); -/// @endcode -/// -/// @param source A string containing functions containing script code to -/// compile -/// @return A module containing the compiled functions +/// \endrst std::shared_ptr compile(const std::string& source); } // namespace jit diff --git a/torch/csrc/api/include/torch/nn/module.h b/torch/csrc/api/include/torch/nn/module.h index f3ea5f2282951..ccfa10a90ad2a 100644 --- a/torch/csrc/api/include/torch/nn/module.h +++ b/torch/csrc/api/include/torch/nn/module.h @@ -28,7 +28,6 @@ namespace nn { /// /// \rst /// .. note:: -/// /// The design and implementation of this class is largely based on the Python /// API. You may want to consult [its /// documentation](https://pytorch.org/docs/master/nn.html#torch.nn.Module) @@ -219,13 +218,12 @@ class Module { /// This method is useful when calling `apply()` on a `ModuleCursor`. /// \rst /// .. 
code-block:: cpp - /// - /// void initialize_weights(nn::Module& module) { - /// torch::NoGradGuard no_grad; - /// if (auto* linear = module.as()) { - /// linear->weight.normal_(0.0, 0.02); + /// void initialize_weights(nn::Module& module) { + /// torch::NoGradGuard no_grad; + /// if (auto* linear = module.as()) { + /// linear->weight.normal_(0.0, 0.02); + /// } /// } - /// } /// /// MyModule module; /// module->modules().apply(initialize_weights); @@ -239,12 +237,12 @@ class Module { /// \rst /// .. code-block:: cpp /// - /// void initialize_weights(nn::Module& module) { - /// torch::NoGradGuard no_grad; - /// if (auto* linear = module.as()) { - /// linear->weight.normal_(0.0, 0.02); + /// void initialize_weights(nn::Module& module) { + /// torch::NoGradGuard no_grad; + /// if (auto* linear = module.as()) { + /// linear->weight.normal_(0.0, 0.02); + /// } /// } - /// } /// /// MyModule module; /// module->modules().apply(initialize_weights); @@ -263,9 +261,9 @@ class Module { /// /// \rst /// .. code-block: cpp - /// MyModule::MyModule() { - /// weight_ = register_parameter("weight", torch::randn({A, B})); - /// } + /// MyModule::MyModule() { + /// weight_ = register_parameter("weight", torch::randn({A, B})); + /// } /// \endrst Tensor& register_parameter( std::string name, @@ -280,9 +278,9 @@ class Module { /// /// \rst /// .. code-block: cpp - /// MyModule::MyModule() { - /// mean_ = register_buffer("mean", torch::empty({num_features_})); - /// } + /// MyModule::MyModule() { + /// mean_ = register_buffer("mean", torch::empty({num_features_})); + /// } /// \endrst Tensor& register_buffer(std::string name, Tensor tensor); @@ -293,9 +291,9 @@ class Module { /// /// \rst /// .. code-block: cpp - /// MyModule::MyModule() { - /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); - /// } + /// MyModule::MyModule() { + /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); + /// } /// \endrst template std::shared_ptr register_module( @@ -311,9 +309,9 @@ class Module { /// /// \rst /// .. code-block: cpp - /// MyModule::MyModule() { - /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); - /// } + /// MyModule::MyModule() { + /// submodule_ = register_module("linear", torch::nn::Linear(3, 4)); + /// } /// \endrst template std::shared_ptr register_module( diff --git a/torch/csrc/api/include/torch/nn/modules/dropout.h b/torch/csrc/api/include/torch/nn/modules/dropout.h index 23c3e4f127d97..f547b5b2fa2d2 100644 --- a/torch/csrc/api/include/torch/nn/modules/dropout.h +++ b/torch/csrc/api/include/torch/nn/modules/dropout.h @@ -11,6 +11,8 @@ namespace torch { namespace nn { struct DropoutOptions { DropoutOptions(double rate); + /// The probability with which a particular component of the input is set to + /// zero. TORCH_ARG(double, rate) = 0.5; }; @@ -23,26 +25,50 @@ class DropoutImplBase : public torch::nn::Cloneable { explicit DropoutImplBase(DropoutOptions options_); void reset() override; + + /// During training, applies a noise mask to the input tensor. + /// During evaluation, applies an identity function. Tensor forward(Tensor input); + + /// Returns a noise mask that can be applied to the given input tensor. + /// Used inside `forward()` to generate the noise mask for dropout. virtual Tensor noise_mask(Tensor input) const = 0; DropoutOptions options; }; } // namespace detail +/// Applies [Dropout](https://arxiv.org/abs/1207.0580) during training. 
+/// +/// See https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout to learn more +/// about the exact semantics of this module. class DropoutImpl : public detail::DropoutImplBase { public: using detail::DropoutImplBase::DropoutImplBase; Tensor noise_mask(Tensor input) const override; }; +/// Applies [Dropout](https://arxiv.org/abs/1207.0580) to inputs with +/// 2-dimensional features. +/// +/// See https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout2d to learn more +/// about the exact semantics of this module. class Dropout2dImpl : public detail::DropoutImplBase { public: using detail::DropoutImplBase::DropoutImplBase; Tensor noise_mask(Tensor input) const override; }; +/// A `ModuleHolder` subclass for `DropoutImpl`. +/// See the documentation for `DropoutImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Dropout); + +/// A `ModuleHolder` subclass for `Dropout2dImpl`. +/// See the documentation for `Dropout2dImpl` class to learn what methods it +/// provides, or the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. TORCH_MODULE(Dropout2d); } // namespace nn } // namespace torch diff --git a/torch/csrc/api/src/nn/init.cpp b/torch/csrc/api/src/nn/init.cpp index 2dbfe78444f12..1afc1858b2aa9 100644 --- a/torch/csrc/api/src/nn/init.cpp +++ b/torch/csrc/api/src/nn/init.cpp @@ -20,7 +20,7 @@ struct Fan { const auto dimensions = tensor.ndimension(); AT_CHECK( dimensions >= 2, - "Fan in and fan out can not be computed for tensor with less than 2 dimensions"); + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"); if (dimensions == 2) { in = tensor.size(1); diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index e124802622ae1..7e9c1df9dd45e 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -88,7 +88,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { Tensor q = flat_grad.neg(); for (int64_t i = num_old - 1; i >= 0; i--) { al.at(i) = old_stps.at(i).dot(q) * ro.at(i); - q.add_(old_dirs.at(i), -al.at(i)._local_scalar()); + q.add_(old_dirs.at(i), -at::_local_scalar(al.at(i))); } // Multiply by initial Hessian @@ -98,7 +98,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { for (int64_t i = 0; i < num_old; i++) { Tensor be_i = old_dirs.at(i).dot(r) * ro.at(i); - r.add_(old_stps.at(i), (al.at(i) - be_i)._local_scalar()); + r.add_(old_stps.at(i), at::_local_scalar(al.at(i) - be_i)); } prev_flat_grad.copy_(flat_grad); } @@ -109,7 +109,7 @@ torch::Tensor LBFGS::step(LossClosure closure) { // reset initial guess for step size if (n_iter == 1) { - t = (at::min(ONE, ONE / abs_grad_sum) * options.learning_rate_)._local_scalar(); + t = at::_local_scalar(at::min(ONE, ONE / abs_grad_sum) * options.learning_rate_); } else { t = options.learning_rate_; } diff --git a/torch/csrc/autograd/functions/accumulate_grad.cpp b/torch/csrc/autograd/functions/accumulate_grad.cpp index fd24f6987642b..221a28f3d1e26 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.cpp +++ b/torch/csrc/autograd/functions/accumulate_grad.cpp @@ -33,14 +33,26 @@ auto AccumulateGrad::apply(variable_list&& grads) -> variable_list { if (!variable.requires_grad()) return {}; - auto new_grad = grads[0]; + auto new_grad = std::move(grads[0]); for (auto& hook : variable.hooks()) { new_grad = (*hook)({new_grad})[0]; } at::Tensor& grad = variable.grad(); if (!grad.defined()) { - 
variable.grad() = new_grad.clone();
+ // under the following conditions, we can avoid clone()
+ if (!GradMode::is_enabled()
+ && !new_grad.type().is_sparse()
+ && new_grad.is_contiguous()
+ && new_grad.use_count() == 1) {
+ // first check that we are in first-order-gradient-only mode,
+ // then check that the gradient is not sparse before calling is_contiguous,
+ // then check that it is contiguous, otherwise later in-place accumulation may fail,
+ // and lastly, check that this is the last reference before we take ownership of it
+ variable.grad() = new_grad.detach();
+ } else {
+ variable.grad() = new_grad.clone();
+ } } else if (!GradMode::is_enabled()) { Variable& grad_variable = as_variable_ref(grad); // This case is not strictly necessary, but it makes the first-order only case diff --git a/torch/csrc/autograd/utils/wrap_outputs.h b/torch/csrc/autograd/utils/wrap_outputs.h index 1417956eb1840..40ef7ce25f8dd 100644 --- a/torch/csrc/autograd/utils/wrap_outputs.h +++ b/torch/csrc/autograd/utils/wrap_outputs.h @@ -81,6 +81,12 @@ inline PyObject* wrap(double value) { return PyFloat_FromDouble(value); }
+inline PyObject* wrap(std::complex<double> value) {
+ // PyComplex_FromCComplex with a reinterpret_cast would also work, but
+ // PyComplex_FromDoubles is simpler.
+ return PyComplex_FromDoubles(value.real(), value.imag());
+}
+ inline PyObject* wrap(void* value) { return THPUtils_packInt64(reinterpret_cast<intptr_t>(value)); } diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 6ec348dc01a0d..a3bfd2f7749a6 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(data.type().type_id(), data.type().scalarType(), /* is variable */ true), + : TensorImpl(data.type().type_id(), data.type().scalarType(), data.type().allocator(), /* is variable */ true), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -113,7 +113,7 @@ std::shared_ptr<Function> Variable::Impl::get_grad_accumulator() { return result; } -Tensor Variable::Impl::detach() const { +Variable Variable::Impl::detach() const { auto detached = make_variable(data_, /*requires_grad=*/false); detached.set_version_counter(version_counter_); return detached; diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index a1dce7e6c792a..5a59afda5a864 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -59,7 +59,11 @@ struct Function; /// which case it tracks that `Variable`'s data and autograd history. Beyond /// construction, the interface of a view is identical to that of a regular /// `Variable`. You can determine whether `Variable` is in fact a view by -/// probing its `is_view()` method. +/// probing its `is_view()` method. Note that the *view* semantics are only +/// meaningful for `Variable` relations that are relevant to autograd. For +/// example, if you hide your code from autograd using `.data`, the `Variable`s +/// will not be registered as having view relations, even if they share storage. +/// /// /// Interface ///~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -173,6 +177,22 @@ struct TORCH_API Variable : public at::Tensor { } } + /// Returns a copy of this `Variable` that is detached from its autograd graph + /// and has a blank version. This method is OK to call if the `Variable` is a + /// view. + Variable detach() const; + + /// Like `detach()`, but removes this `Variable` in-place.
This method may + /// only be called on non-view `Variable`s. You can use `is_view()` to check + /// this. If this `Variable` is a view, throws an `std::runtime_error()`. + void detach_(); + + /// Computes the gradient of current tensor w.r.t. graph leaves. + void backward(at::optional gradient, bool keep_graph, bool create_graph) const; + + /// Sets the type of the Variable. + void set_data(Tensor new_data) const; + /// Set the gradient edge -- i.e. `grad_fn` and `input_nr` -- of the /// `Variable`. /// NOTE: This will always set the `grad_fn`, even if this is a leaf variable, @@ -307,24 +327,15 @@ struct TORCH_API Variable::Impl : public at::TensorImpl { return grad_; } - /// Returns a copy of this `Variable` that is detached from its autograd graph - /// and has a blank version. This method is OK to call if the `Variable` is a - /// view. - Tensor detach() const override; - - /// Like `detach()`, but removes this `Variable` in-place. This method may - /// only be called on non-view `Variable`s. You can use `is_view()` to check - /// this. If this `Variable` is a view, throws an `std::runtime_error()`. - void detach_() override; + Variable detach() const; + void detach_(); - /// Sets the type of the Variable. - void set_data(Tensor new_data) override; + void set_data(Tensor new_data); - /// Computes the gradient of current tensor w.r.t. graph leaves. void backward( at::optional gradient, bool keep_graph, - bool create_graph) override; + bool create_graph); /// Reset all expensive fields to free up resources void release_resources() override; @@ -485,6 +496,22 @@ inline std::shared_ptr Variable::grad_accumulator() const { return get()->get_grad_accumulator(); } +inline Variable Variable::detach() const { + return get()->detach(); +} + +inline void Variable::detach_() { + get()->detach_(); +} + +inline void Variable::backward(at::optional gradient, bool keep_graph, bool create_graph) const { + get()->backward(gradient, keep_graph, create_graph); +} + +inline void Variable::set_data(Tensor new_data) const { + get()->set_data(new_data); +} + inline void Variable::set_gradient_edge(Edge edge) noexcept { get()->grad_fn_ = std::move(edge.function); get()->output_nr_ = edge.input_nr; diff --git a/torch/csrc/distributed/c10d/ddp.cpp b/torch/csrc/distributed/c10d/ddp.cpp new file mode 100644 index 0000000000000..34a63ab613969 --- /dev/null +++ b/torch/csrc/distributed/c10d/ddp.cpp @@ -0,0 +1,104 @@ +#include + +#include +#include + +#include + +#include + +#include +#include +#include + +namespace c10d { +namespace { +/// For every replica except the root, copy the data from `broadcastTensors` +/// to `replicaData`. +void copyBroadcastTensorsToReplicas( + const std::vector>& broadcastTensors, + std::vector>& replicaData) { + AT_ASSERT(replicaData.size() == broadcastTensors.size()); + // replica = 1 means we skip the root (replica 0). 
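+  // `Tensor::set_` below makes each replica tensor adopt the storage of the
+  // corresponding broadcast result, so the non-root replicas end up sharing
+  // the broadcast data rather than holding their own copies of it.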
+ for (size_t replica = 1; replica < replicaData.size(); ++replica) { + AT_ASSERT(replicaData[replica].size() == broadcastTensors[replica].size()); + for (size_t tensor = 0; tensor < replicaData[replica].size(); ++tensor) { + replicaData[replica][tensor].set_(broadcastTensors[replica][tensor]); + } + } +} +} // namespace + +void distBroadcastCoalesced( + std::vector& tensors, + int64_t bufferSize, + ProcessGroup& processGroup) { + auto tensorGroups = torch::utils::take_tensors(tensors, bufferSize); + // We store single-element vectors in `flatTensors` because + // `ProcessGroup::broadcast` takes a reference to a vector, which must be + // alive until the `wait()` call on the returned `Work` completes. + std::vector> flatTensors; + std::vector> work; + flatTensors.reserve(tensorGroups.size()); + work.reserve(tensorGroups.size()); + for (const auto& group : tensorGroups) { + // Flatten each group of tensors (whose size equals `bufferSize`) into a + // single tensor. + flatTensors.push_back({torch::utils::flatten_dense_tensors(group.tensors)}); + BroadcastOptions broadcastOptions; + broadcastOptions.rootRank = 0; + broadcastOptions.rootTensor = 0; + // Enqueue a work item and collect the `Work` (essentially a "future") so we + // can `wait()` for its completion after we have collected all `Work` items. + work.push_back( + processGroup.broadcast(flatTensors.back(), broadcastOptions)); + } + // Now loop through each group, wait for the broadcast to complete, and + // un-flatten the broadcast tensor back into device-local individual tensors. + for (size_t group = 0; group < tensorGroups.size(); ++group) { + auto& tensors = tensorGroups[group].tensors; + work[group]->wait(); + const auto synced = + torch::utils::unflatten_dense_tensors(flatTensors[group][0], tensors); + AT_ASSERT(synced.size() == tensors.size()); + for (size_t i = 0; i < synced.size(); ++i) { + // Copy into the per-process tensors. + tensors[i].copy_(synced[i], /*non_blocking=*/true); + } + } +} + +void syncParams( + ProcessGroup& processGroup, + std::vector>& parameterData, + std::vector>& bufferData, + const std::vector& devices, + int64_t broadcastBucketSize, + bool broadcastBuffers) { + AT_ASSERT(!parameterData.empty()); + AT_ASSERT(!bufferData.empty()); + AT_ASSERT(!devices.empty()); + + // Do an intra-node sync if we have more than one device. + if (devices.size() > 1) { + // Broadcast the parameters, get back a vector>, one + // vector per device. Each such vector then needs to be copied into + // the `parameterData` of every step. + auto result = torch::cuda::broadcast_coalesced( + parameterData[0], devices, broadcastBucketSize); + copyBroadcastTensorsToReplicas(result, parameterData); + } + + if (broadcastBuffers && !bufferData[0].empty()) { + // Do an inter-node sync first. + distBroadcastCoalesced(bufferData[0], broadcastBucketSize, processGroup); + // Then an intra-node sync if we have more than one device. 
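+    // (The inter-node broadcast above makes every process agree on rank 0's
+    // buffer values; the intra-node broadcast below then fans those values out
+    // from the first device to the remaining local devices.)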
+ if (devices.size() > 1) { + auto result = torch::cuda::broadcast_coalesced( + bufferData[0], devices, broadcastBucketSize); + copyBroadcastTensorsToReplicas(result, bufferData); + } + } +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/ddp.h b/torch/csrc/distributed/c10d/ddp.h index 7b26c1475fc1c..f3aa61d6017bc 100644 --- a/torch/csrc/distributed/c10d/ddp.h +++ b/torch/csrc/distributed/c10d/ddp.h @@ -1,9 +1,5 @@ #pragma once -#include - -#include - #include #include @@ -11,42 +7,20 @@ #include namespace c10d { -inline void distBroadcastCoalesced( +class ProcessGroup; +} // namespace c10d + +namespace c10d { +void distBroadcastCoalesced( std::vector& tensors, int64_t bufferSize, - ProcessGroup& processGroup) { - auto tensorGroups = torch::utils::take_tensors(tensors, bufferSize); - // We store single-element vectors in `flatTensors` because - // `ProcessGroup::broadcast` takes a reference to a vector, which must be - // alive until the `wait()` call on the returned `Work` completes. - std::vector> flatTensors; - std::vector> work; - flatTensors.reserve(tensorGroups.size()); - work.reserve(tensorGroups.size()); - for (const auto& group : tensorGroups) { - // Flatten each group of tensors (whose size equals `bufferSize`) into a - // single tensor. - flatTensors.push_back({torch::utils::flatten_dense_tensors(group.tensors)}); - BroadcastOptions broadcastOptions; - broadcastOptions.rootRank = 0; - broadcastOptions.rootTensor = 0; - // Enqueue a work item and collect the `Work` (essntially a "future") so we - // can `wait()` for its completion after we have collected all `Work` items. - work.push_back( - processGroup.broadcast(flatTensors.back(), broadcastOptions)); - } - // Now loop through each group, wait for the broadcast to complete, and - // un-flatten the broadcast tensor back into device-local individual tensors. - for (size_t group = 0; group < tensorGroups.size(); ++group) { - auto& tensors = tensorGroups[group].tensors; - work[group]->wait(); - const auto synced = - torch::utils::unflatten_dense_tensors(flatTensors[group][0], tensors); - AT_ASSERT(synced.size() == tensors.size()); - for (size_t i = 0; i < synced.size(); ++i) { - // Copy into the per-process tensors. 
- tensors[i].copy_(synced[i], /*non_blocking=*/true); - } - } -} + ProcessGroup& processGroup); + +void syncParams( + ProcessGroup& processGroup, + std::vector>& parameterData, + std::vector>& bufferData, + const std::vector& devices, + int64_t broadcastBucketSize, + bool broadcastBuffers); } // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 10cd00976ee75..021e5e01247d1 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -375,7 +375,23 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); - module.def("_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced); + module.def( + "_dist_broadcast_coalesced", + &::c10d::distBroadcastCoalesced, + py::arg("tensors"), + py::arg("buffer_size"), + py::arg("process_group"), + py::call_guard()); + module.def( + "_sync_params", + &::c10d::syncParams, + py::arg("process_group"), + py::arg("parameter_data"), + py::arg("buffer_data"), + py::arg("devices"), + py::arg("broadcast_bucket_size"), + py::arg("broadcast_buffers"), + py::call_guard()); Py_RETURN_TRUE; } diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 98141d3382939..e5e6cf1960df6 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -6,6 +6,7 @@ #include "torch/csrc/utils/functional.h" #include +#include "torch/csrc/jit/passes/dead_code_elimination.h" #include #include @@ -43,11 +44,14 @@ std::string getNodeStackTraceString(const Node* n) { return ss.str(); } -void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExportTypes operator_export_type) { - for (auto node : graph->nodes()) { - // Macro'ed so we get a marginally better line number on failed export +void validateBlock(Block *b, onnx_torch::OperatorExportTypes operator_export_type) { + for (auto node : b->nodes()) { + for (Block *sub_block : node->blocks()) { + validateBlock(sub_block, operator_export_type); + } + // Macro'ed so we get a marginally better line number on failed export #define FAIL_EXPORT(name) \ - throw std::runtime_error(std::string("ONNX export failed: ") + name + "\n\nGraph we tried to export:\n" + graph->toString()); + throw std::runtime_error(std::string("ONNX export failed: ") + name + "\n\nGraph we tried to export:\n" + b->owningGraph()->toString()); IR_IF(node, PythonOp) auto py_node = static_cast(value); FAIL_EXPORT( @@ -56,9 +60,19 @@ void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExpo IR_ELSE() // Special error messages for certain types of operators if (node->kind() == aten::expand) { - FAIL_EXPORT( - "Could not export a broadcasted operation; ONNX likely does not support this form of broadcasting.\n\nBroadcast occurred at:\n" + - getNodeStackTraceString(node)); + if (operator_export_type == onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { + WithInsertPoint guard(node); + auto* new_node = b->owningGraph()->insertNode( + b->owningGraph()->create(Symbol(::torch::jit::onnx::ATen), node->inputs(), node->outputs().size())); + for (size_t i = 0; i < node->outputs().size(); ++i) { + node->output(i)->replaceAllUsesWith(new_node->output(i)); + } + new_node->s_(Symbol::fromQualString("attr::operator"), "expand"); + } else { + FAIL_EXPORT( + "Could not export a broadcasted operation; ONNX likely does not support this form of broadcasting.\n\nBroadcast occurred at:\n" + + getNodeStackTraceString(node)); + } } if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) { 
FAIL_EXPORT( @@ -76,6 +90,11 @@ void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExpo } } +void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExportTypes operator_export_type) { + validateBlock(graph->block(), operator_export_type); + EliminateDeadCode(graph); +} + class EncoderBase { public: EncoderBase(onnx_torch::OperatorExportTypes operator_export_type, bool strip_doc); diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index eb4956087c6c1..ce472707319fb 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -254,6 +254,31 @@ void initJITBindings(PyObject *module) { } }, py::arg("qualified_name")); + py::class_(m, "FunctionSchema") + .def_property_readonly("name", [](FunctionSchema& self) { return self.name; }) + .def_property_readonly("arguments", [](FunctionSchema& self) { return self.arguments; }) + .def_property_readonly("returns", [](FunctionSchema& self) { return self.returns; }); + py::class_(m, "Argument") + .def_property_readonly("name", [](Argument& self) { return self.name; }) + .def_property_readonly("type", [](Argument& self) { return self.type; }) + .def_property_readonly("N", [](Argument& self) -> py::object { + return (self.N) ? py::cast(*self.N) : py::none(); + }) + .def_property_readonly("default_value", [](Argument& self) -> py::object { + if(!self.default_value) + return py::none(); + IValue v = *self.default_value; + return toPyObject(std::move(v)); + }); + m.def("_jit_get_schemas_for_operator", [](const std::string& qualified_name) { + auto symbol = Symbol::fromQualString(qualified_name); + auto operations = getAllOperatorsFor(std::move(symbol)); + return fmap(operations, [](const std::shared_ptr& op) { + return op->schema(); + }); + }); + + initPythonIRBindings(module); tracer::initPythonTracerBindings(module); script::initTreeViewBindings(module); diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 32216971834e8..6d8f409fa3a2c 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -86,6 +86,7 @@ namespace torch { namespace jit { _(onnx, Greater) \ _(onnx, Less) \ _(onnx, Not) \ + _(onnx, ATen) \ FORALL_ATTR_BASE_SYMBOLS(_) \ _(attr, Subgraph) \ _(attr, ReverseSubgraph) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 1132b5cd3f562..6864d8ad79708 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -337,7 +337,7 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(at::UndefinedTensorId(), at::ScalarType::Undefined, /* is_variable */ false) {} + : TensorImpl(at::UndefinedTensorId(), at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} virtual ~ContainerTensor() = default; virtual at::IntList sizes() const override { diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 9053b45e2e423..a65a743ddb24c 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -135,7 +135,7 @@ void printAttributes(std::ostream & out, const Node * n, bool ignore_subgraph=fa at::Tensor t = n->t(name); // 1-elem tensors are usually boxed scalars, so print them like it if (t.numel() == 1) { - auto scalar_tensor = t.view({})._local_scalar(); + auto scalar_tensor = at::_local_scalar(t.view({})); out << "{"; if (scalar_tensor.isFloatingPoint()) { out << scalar_tensor.toDouble(); diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index f7285e2e3d30a..4b76872cb3f96 100644 --- 
a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -78,6 +78,19 @@ inline IValue createGenericList(py::handle obj, const TypePtr& elem_type) { return ConstantList::create(std::move(elems)); } +struct ConvertError : public std::exception { + ConvertError(std::string msg) + : msg_(std::move(msg)) {} + const char* what() const noexcept override { + return msg_.c_str(); + } +private: + std::string msg_; +}; + +#define TORCH_CONVERT_ERROR(...) \ + throw ConvertError(at::str(__VA_ARGS__)) + inline IValue toIValue(py::handle obj, const TypePtr& type) { switch (type->kind()) { case TypeKind::DynamicType: @@ -95,7 +108,7 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { size_t tuple_size = tuple.size(); const auto & elem_types = type->cast()->elements(); if (elem_types.size() != tuple_size) { - AT_ERROR("Expected ", elem_types.size(), " tuple elements for argument, but got ", tuple_size); + TORCH_CONVERT_ERROR("Expected ", elem_types.size(), " tuple elements for argument, but got ", tuple_size); } std::vector values; values.reserve(tuple_size); @@ -121,9 +134,9 @@ inline IValue toIValue(py::handle obj, const TypePtr& type) { } } case TypeKind::NumberType: - AT_ERROR("Insufficient type information to convert input"); + TORCH_CONVERT_ERROR("Insufficient type information to convert input"); case TypeKind::GeneratorType: - AT_ERROR("Generators are not supported yet."); + TORCH_CONVERT_ERROR("Generators are not supported yet."); } AT_ERROR("Missing cases in toIValue! File a bug report."); } @@ -142,7 +155,15 @@ inline IValue argumentToIValue( "' in position ", argumentPosition, ", but instead got value of type ", py::str(object.get_type().attr("__name__")), - ". Declaration: ", schema); + ".\nDeclaration: ", schema); + } catch (const ConvertError& error) { + AT_ERROR( + schema.name, "(): ", error.what(), + "\n for argument '", argument.name, + "' in position ", argumentPosition, + ", but instead got value of type ", + py::str(object.get_type().attr("__name__")), + ".\nDeclaration: ", schema); } } diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 8627fb1e904eb..416de46f1e695 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -438,7 +438,7 @@ void initPythonIRBindings(PyObject * module_) { py::class_>(m,"Type") .def("__repr__",[](Type & t) { - return t.str(); + return t.python_str(); }) .def("kind",[](Type& t_) { Type * t = &t_; diff --git a/torch/csrc/jit/register_special_ops.cpp b/torch/csrc/jit/register_special_ops.cpp new file mode 100644 index 0000000000000..f199b8ff82ec4 --- /dev/null +++ b/torch/csrc/jit/register_special_ops.cpp @@ -0,0 +1,30 @@ +#include "torch/csrc/autograd/profiler.h" +#include "torch/csrc/jit/custom_operator.h" +#include "torch/csrc/jit/operator.h" + +namespace torch { +namespace jit { + +namespace { +RegisterOperators reg({ + Operator( + "aten::split(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]", + [](Stack& stack) { + autograd::profiler::RecordFunction record("split_with_sizes"); + auto result = at::split_with_sizes( + (std::move(peek(stack, 0, 3))).toTensor(), + (std::move(peek(stack, 1, 3))).toIntList()->elements(), + (std::move(peek(stack, 2, 3))).toInt()); + drop(stack, 3); + pack(stack, std::move(result)); + return 0; + }), + Operator( + "aten::Size(int[] sizes) -> int[]", + [](Stack& stack) { + return 0; + }), +}); +} +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h index ac37dcd185764..73adda456c39f 100644 --- 
a/torch/csrc/jit/tracer.h +++ b/torch/csrc/jit/tracer.h @@ -5,13 +5,13 @@ #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/constants.h" #include "torch/csrc/jit/stack.h" +#include "torch/csrc/jit/tracing_state.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/utils/variadic.h" #include "torch/csrc/utils/variadic.h" #include "torch/csrc/WindowsTorchApiMacro.h" - #include #include @@ -29,75 +29,6 @@ using variable_list = std::vector; TORCH_API void recordSourceLocation(Node* n); TORCH_API void setRecordSourceLocation(void (*v)(Node*)); -struct TORCH_API TracingState : public std::enable_shared_from_this { - TracingState(); - ~TracingState(); - - using WeakTensor = at::WeakTensor; - - struct WeakTensorHasher { - size_t operator()(const WeakTensor& t) const { - return std::hash()(t.unsafeGetTensorImpl()); - } - }; - - struct WeakTensorEq { - bool operator()(const WeakTensor& t1, const WeakTensor& t2) const { - return t1.is_same(t2); - } - }; - - std::unordered_map value_map; - std::shared_ptr graph; - bool warn = true; -}; - - -// This is meant to be used as a thread local place, where we can store extra -// info that gets lost when we call into ATen from Python bindings. One example -// for when this happens is when we get an IntList argument with e.g. sizes for -// view. When tracing, those might be tensors, which let us encode extra data -// dependencies, but once they get to the ATen call where we actually have the -// tracing logic, they get converted into a raw IntList, and we loose all -// information. To prevent this, we temporarily stash it in here. -struct ArgumentStash { - struct IntListTrace : std::vector { - IntListTrace(int size) - : std::vector(size, nullptr) {} - }; - - static bool empty() { - return stash.intlists.empty(); - } - - TORCH_API static void stashIntListElem(const std::string& arg_name, - size_t size, - size_t idx, - const Variable& var); - - static bool hasIntList(const std::string& arg_name) { - return stash.intlists.count(arg_name) > 0; - } - - static IntListTrace popIntList(const std::string& arg_name) { - auto info = std::move(stash.intlists.at(arg_name)); - stash.intlists.erase(arg_name); - return info; - } - -private: - static thread_local ArgumentStash stash; - std::unordered_map intlists; -}; - -// Retrieve or set the current tracing state. Returns a nullptr if tracing is disabled. -TORCH_API const std::shared_ptr& getTracingState(); -TORCH_API void setTracingState(std::shared_ptr state); - -inline bool isTracing() { - return static_cast(getTracingState()); -} - // Having finished adding a new 'node' to the graph IR 'setValueTrace' associates // this node with an output variable, so that further operations involving this // variable know which node in the IR to reference. 
@@ -264,31 +195,4 @@ TORCH_API void addOutput(Node* node, const std::vector& list); TORCH_API autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim); -using warn_fn_type = void (*)(const std::string& msg); -TORCH_API void _do_warn(const char * _reason); -inline void warn(const char * _reason) { - if (auto state = getTracingState()) { - if (!state->warn) return; - _do_warn(_reason); - } -} -TORCH_API void setWarn(warn_fn_type fn); - -struct TORCH_API NoWarn { - NoWarn(): state(getTracingState()) { - if (state) { - prev = state->warn; - state->warn = false; - } - } - ~NoWarn() { - if (state) { - state->warn = prev; - } - } - std::shared_ptr state; - bool prev; -}; - - }}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/tracing_state.h b/torch/csrc/jit/tracing_state.h new file mode 100644 index 0000000000000..34808f0272e46 --- /dev/null +++ b/torch/csrc/jit/tracing_state.h @@ -0,0 +1,123 @@ +#pragma once + +#include "torch/csrc/autograd/function_hook.h" +#include "torch/csrc/autograd/variable.h" +#include "torch/csrc/jit/assertions.h" +#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/stack.h" +#include "torch/csrc/utils/functional.h" +#include "torch/csrc/utils/functional.h" +#include "torch/csrc/utils/variadic.h" +#include "torch/csrc/utils/variadic.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace torch { namespace jit { namespace tracer { + +using torch::autograd::Variable; +using variable_list = std::vector; + +struct TORCH_API TracingState : public std::enable_shared_from_this { + TracingState(); + ~TracingState(); + + using WeakTensor = at::WeakTensor; + + struct WeakTensorHasher { + size_t operator()(const WeakTensor& t) const { + return std::hash()(t.unsafeGetTensorImpl()); + } + }; + + struct WeakTensorEq { + bool operator()(const WeakTensor& t1, const WeakTensor& t2) const { + return t1.is_same(t2); + } + }; + + std::unordered_map value_map; + std::shared_ptr graph; + bool warn = true; +}; + + +// This is meant to be used as a thread local place, where we can store extra +// info that gets lost when we call into ATen from Python bindings. One example +// for when this happens is when we get an IntList argument with e.g. sizes for +// view. When tracing, those might be tensors, which let us encode extra data +// dependencies, but once they get to the ATen call where we actually have the +// tracing logic, they get converted into a raw IntList, and we loose all +// information. To prevent this, we temporarily stash it in here. +struct ArgumentStash { + struct IntListTrace : std::vector { + IntListTrace(int size) + : std::vector(size, nullptr) {} + }; + + static bool empty() { + return stash.intlists.empty(); + } + + TORCH_API static void stashIntListElem(const std::string& arg_name, + size_t size, + size_t idx, + const Variable& var); + + static bool hasIntList(const std::string& arg_name) { + return stash.intlists.count(arg_name) > 0; + } + + static IntListTrace popIntList(const std::string& arg_name) { + auto info = std::move(stash.intlists.at(arg_name)); + stash.intlists.erase(arg_name); + return info; + } + +private: + static thread_local ArgumentStash stash; + std::unordered_map intlists; +}; + +// Retrieve or set the current tracing state. Returns a nullptr if tracing is disabled. 
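+// A typical pattern (illustrative sketch only) is to stash the state, disable
+// tracing around code that must not be recorded, and then restore it:
+//
+//   auto state = getTracingState();
+//   setTracingState(nullptr);          // run the next section untraced
+//   ...                                // work that should not appear in the trace
+//   setTracingState(std::move(state));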
+TORCH_API const std::shared_ptr& getTracingState(); +TORCH_API void setTracingState(std::shared_ptr state); + +inline bool isTracing() { + return static_cast(getTracingState()); +} + +using warn_fn_type = void (*)(const std::string& msg); +TORCH_API void _do_warn(const char * _reason); +inline void warn(const char * _reason) { + if (auto state = getTracingState()) { + if (!state->warn) return; + _do_warn(_reason); + } +} +TORCH_API void setWarn(warn_fn_type fn); + +struct TORCH_API NoWarn { + NoWarn(): state(getTracingState()) { + if (state) { + prev = state->warn; + state->warn = false; + } + } + ~NoWarn() { + if (state) { + state->warn = prev; + } + } + std::shared_ptr state; + bool prev; +}; + +}}} // namespace torch::jit::tracer diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 2c9c1d550ae2a..f3be38cbb2b00 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -71,10 +71,16 @@ struct TORCH_API Type : std::enable_shared_from_this { virtual bool isSubtypeOf(const TypePtr rhs) const { return *this == *rhs; } - // user-friendly form of the type, separate from - // operator<< which is verbose and unambiguous + + // How this type will appear in FunctionSchema declarations virtual std::string str() const = 0; + // How this type will appear as if it were a type annotation in Python + // which is sometimes different than how it appears in declarations (e.g. int[] vs List[int]) + virtual std::string python_str() const { + return str(); + } + TypeKind kind() const { return kind_; } @@ -331,6 +337,11 @@ struct TORCH_API ListType : public Type { ss << getElementType()->str() << "[]"; return ss.str(); } + std::string python_str() const override { + std::stringstream ss; + ss << "List[" << getElementType()->python_str() << "]"; + return ss.str(); + } TypePtr getElementType() const { return elem; } @@ -379,6 +390,17 @@ struct TORCH_API TupleType : public Type { ss << ")"; return ss.str(); } + std::string python_str() const override { + std::stringstream ss; + ss << "Tuple["; + for(size_t i = 0; i < elements().size(); ++i) { + if(i > 0) + ss << ", "; + ss << elements()[i]->python_str(); + } + ss << "]"; + return ss.str(); + } private: TupleType(std::vector elements_) : Type(TypeKind::TupleType) diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h index 65ddc5df8f98f..2188681906d53 100644 --- a/torch/csrc/utils/pybind.h +++ b/torch/csrc/utils/pybind.h @@ -8,6 +8,8 @@ #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/autograd/python_variable.h" +#include "torch/csrc/utils/python_tuples.h" +#include "torch/csrc/utils/python_numbers.h" #include @@ -57,6 +59,39 @@ template<> struct type_caster { } }; +template<> struct type_caster { +public: + PYBIND11_TYPE_CASTER(at::IntList, _("at::IntList")); + + bool load(handle src, bool) { + PyObject *source = src.ptr(); + auto tuple = PyTuple_Check(source); + if (tuple || PyList_Check(source)) { + auto size = tuple ? PyTuple_GET_SIZE(source) : PyList_GET_SIZE(source); + v_value.resize(size); + for (int idx = 0; idx < size; idx++) { + PyObject* obj = tuple ? 
PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx); + if (THPVariable_Check(obj)) { + v_value[idx] = THPVariable_Unpack(obj).toCLong(); + } else if (PyLong_Check(obj)) { + // use THPUtils_unpackLong after it is safe to include python_numbers.h + v_value[idx] = THPUtils_unpackLong(obj); + } else { + return false; + } + } + value = v_value; + return true; + } + return false; + } + static handle cast(at::IntList src, return_value_policy /* policy */, handle /* parent */) { + return handle(THPUtils_packInt64Array(src.size(), src.data())); + } +private: + std::vector v_value; +}; + // http://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html#c-17-library-containers template struct type_caster> : optional_caster> {}; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 71b8913a14970..a44ce9b3ed022 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -55,6 +55,7 @@ #include "torch/csrc/utils/object_ptr.h" #include "torch/csrc/utils/python_numbers.h" #include "torch/csrc/utils/python_strings.h" +#include "torch/csrc/autograd/variable.h" #include @@ -226,7 +227,7 @@ inline at::Scalar PythonArgs::scalarWithDefault(int i, at::Scalar default_scalar // Zero-dim tensors are converted to Scalars as-is. Note this doesn't currently // handle most NumPy scalar types except np.float64. if (THPVariable_Check(args[i])) { - return ((THPVariable*)args[i])->cdata._local_scalar(); + return at::_local_scalar(((THPVariable*)args[i])->cdata); } if (THPUtils_checkLong(args[i])) { return at::Scalar(static_cast(THPUtils_unpackLong(args[i]))); diff --git a/torch/csrc/utils/python_numbers.h b/torch/csrc/utils/python_numbers.h index 69f952bc6ec31..6a292e215108b 100644 --- a/torch/csrc/utils/python_numbers.h +++ b/torch/csrc/utils/python_numbers.h @@ -5,7 +5,7 @@ #include #include "torch/csrc/Exceptions.h" #include "torch/csrc/utils/tensor_numpy.h" -#include "torch/csrc/jit/tracer.h" +#include "torch/csrc/jit/tracing_state.h" // largest integer that can be represented consecutively in a double const int64_t DOUBLE_INT_MAX = 9007199254740992; diff --git a/torch/csrc/utils/tensor_conversion_dispatch.cpp b/torch/csrc/utils/tensor_conversion_dispatch.cpp index 030565e8339e3..45dfc956b2a2b 100644 --- a/torch/csrc/utils/tensor_conversion_dispatch.cpp +++ b/torch/csrc/utils/tensor_conversion_dispatch.cpp @@ -42,7 +42,7 @@ at::Tensor dispatch_type_conversion( switch (type.scalarType()) { #define DEFINE_CAST_DISPATCH(_1, n, _2) \ case at::ScalarType::n: { \ - return self._cast_##n(non_blocking); \ + return at::_cast_##n(self, non_blocking); \ } break; AT_FORALL_SCALAR_TYPES(DEFINE_CAST_DISPATCH) #undef DEFINE_CAST_DISPATCH diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 38fa1c8b7821b..bf0e3308b1e66 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -144,6 +144,9 @@ ScalarType infer_scalar_type(PyObject *obj) { return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, nullptr)))); } #endif + if (THPUtils_checkString(obj)) { + throw TypeError("new(): invalid data type '%s'", Py_TYPE(obj)->tp_name); + } if (PySequence_Check(obj)) { at::optional scalarType; auto length = PySequence_Length(obj); @@ -153,7 +156,9 @@ ScalarType infer_scalar_type(PyObject *obj) { for (int i = 0; i < length; ++i) { THPObjectPtr handle(PySequence_GetItem(obj, i)); if (!handle) throw python_error(); - ScalarType item_scalarType = infer_scalar_type(handle.get()); + 
auto cur_item = handle.get(); + if (cur_item == obj) throw TypeError("new(): self-referential lists are incompatible"); + ScalarType item_scalarType = infer_scalar_type(cur_item); scalarType = (scalarType) ? at::promoteTypes(*scalarType, item_scalarType) : item_scalarType; if (scalarType == ScalarType::Double) { diff --git a/torch/distributions/bernoulli.py b/torch/distributions/bernoulli.py index 6939594035945..9db9691c4ddb9 100644 --- a/torch/distributions/bernoulli.py +++ b/torch/distributions/bernoulli.py @@ -83,11 +83,12 @@ def log_prob(self, value): def entropy(self): return binary_cross_entropy_with_logits(self.logits, self.probs, reduction='none') - def enumerate_support(self): + def enumerate_support(self, expand=True): values = self._new((2,)) torch.arange(2, out=values) values = values.view((-1,) + (1,) * len(self._batch_shape)) - values = values.expand((-1,) + self._batch_shape) + if expand: + values = values.expand((-1,) + self._batch_shape) return values @property diff --git a/torch/distributions/binomial.py b/torch/distributions/binomial.py index e15138e58ea68..acbb636ce5ee1 100644 --- a/torch/distributions/binomial.py +++ b/torch/distributions/binomial.py @@ -101,12 +101,13 @@ def log_prob(self, value): value * self.logits + self.total_count * max_val - self.total_count * torch.log1p((self.logits + 2 * max_val).exp())) - def enumerate_support(self): + def enumerate_support(self, expand=True): total_count = int(self.total_count.max()) if not self.total_count.min() == total_count: raise NotImplementedError("Inhomogeneous total count not supported by `enumerate_support`.") values = self._new(1 + total_count,) torch.arange(1 + total_count, out=values) values = values.view((-1,) + (1,) * len(self._batch_shape)) - values = values.expand((-1,) + self._batch_shape) + if expand: + values = values.expand((-1,) + self._batch_shape) return values diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 2f3a425deaefc..6dc046dfab42d 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -103,11 +103,12 @@ def entropy(self): p_log_p = self.logits * self.probs return -p_log_p.sum(-1) - def enumerate_support(self): + def enumerate_support(self, expand=True): num_events = self._num_events values = torch.arange(num_events).long() values = values.view((-1,) + (1,) * len(self._batch_shape)) - values = values.expand((-1,) + self._batch_shape) + if expand: + values = values.expand((-1,) + self._batch_shape) if self._param.is_cuda: values = values.cuda(self._param.get_device()) return values diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index e93c15e63c780..3d1aeb9cadc9c 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -142,7 +142,7 @@ def icdf(self, value): """ raise NotImplementedError - def enumerate_support(self): + def enumerate_support(self, expand=True): """ Returns tensor containing all values supported by a discrete distribution. The result will enumerate over dimension 0, so the shape @@ -150,8 +150,16 @@ def enumerate_support(self): (where `event_shape = ()` for univariate distributions). Note that this enumerates over all batched tensors in lock-step - `[[0, 0], [1, 1], ...]`. To iterate over the full Cartesian product - use `itertools.product(m.enumerate_support())`. + `[[0, 0], [1, 1], ...]`. 
With `expand=False`, enumeration happens + along dim 0, but with the remaining batch dimensions being + singleton dimensions, `[[0], [1], ..`. + + To iterate over the full Cartesian product use + `itertools.product(m.enumerate_support())`. + + Args: + expand (bool): whether to expand the support over the + batch dims to match the distribution's `batch_shape`. Returns: Tensor iterating over dimension 0. diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py index f1dae3d7e138e..3f96fc031b234 100644 --- a/torch/distributions/gamma.py +++ b/torch/distributions/gamma.py @@ -8,7 +8,7 @@ def _standard_gamma(concentration): - return concentration._standard_gamma() + return torch._standard_gamma(concentration) class Gamma(ExponentialFamily): diff --git a/torch/distributions/independent.py b/torch/distributions/independent.py index 1445cce0e4edf..938e478472a9b 100644 --- a/torch/distributions/independent.py +++ b/torch/distributions/independent.py @@ -82,7 +82,7 @@ def entropy(self): entropy = self.base_dist.entropy() return _sum_rightmost(entropy, self.reinterpreted_batch_ndims) - def enumerate_support(self): + def enumerate_support(self, expand=True): if self.reinterpreted_batch_ndims > 0: raise NotImplementedError("Enumeration over cartesian product is not implemented") - return self.base_dist.enumerate_support() + return self.base_dist.enumerate_support(expand=expand) diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index fe2d2d7e6f9af..fbfec01ab7159 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -79,9 +79,11 @@ def log_prob(self, value): def entropy(self): return self._categorical.entropy() - def enumerate_support(self): + def enumerate_support(self, expand=True): n = self.event_shape[0] values = self._new((n, n)) torch.eye(n, out=values) values = values.view((n,) + (1,) * len(self.batch_shape) + (n,)) - return values.expand((n,) + self.batch_shape + (n,)) + if expand: + values = values.expand((n,) + self.batch_shape + (n,)) + return values diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 159902c81c616..dd497d5c50da9 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -59,6 +59,15 @@ def scope(scope_name): def load(filename): + r""" + Load a ``ScriptModule`` previously saved with :func:`save ` + + Arguments: + filename (string): the file to load + + Returns: + A ``ScriptModule`` object. + """ m = ScriptModule() m._load(filename) return m @@ -429,22 +438,25 @@ def trace(func, example_inputs, optimize=True, check_trace=True, check_inputs=No .. warning:: - Just-in-time compilation currently only works for functions/modules - which are not data dependent (e.g., have conditionals on data in - tensors) and do not have any untracked external dependencies (e.g., - perform input/output or access global variables). If you trace such - models, you will silently get incorrect results on subsequent - invocations of the model. - - Arg: - func - a python function or torch.nn.Module that will be run with example_inputs. - arguments and returns to func must be Tensors or (possibly nested) tuples that - contain tensors. - example_inputs - a tuple of example inputs that will be passed to the function - while tracing. The resulting trace can be run with - inputs of different types and shapes assuming the traced operations - support those types and shapes. 
example_inputs may also be a single - Tensor in which case it is automatically wrapped in a tuple + Tracing only correctly records functions and modules which are not data + dependent (e.g., have conditionals on data in tensors) and do not have + any untracked external dependencies (e.g., perform input/output or + access global variables). If you trace such models, you may silently get + incorrect results on subsequent invocations of the model. The tracer + will try to emit warnings when doing something that may cause an + incorrect trace to be produced. + + Arguments: + func (callable or torch.nn.Module): a python function or torch.nn.Module + that will be run with example_inputs. + arguments and returns to func must be Tensors + or (possibly nested) tuples that + contain tensors. + example_inputs (tuple): a tuple of example inputs that will be passed to the function + while tracing. The resulting trace can be run with + inputs of different types and shapes assuming the traced operations + support those types and shapes. example_inputs may also be a single + Tensor in which case it is automatically wrapped in a tuple Keyword arguments: optimize (bool, optional): whether or not to apply optimizations. Default: ``True``. @@ -454,20 +466,20 @@ def trace(func, example_inputs, optimize=True, check_trace=True, check_inputs=No deterministic ops or if you are sure that the network is correct despite a checker failure. - check_inputs (list of tuples. optional): A list of tuples of input arguments that should be used + check_inputs (list of tuples, optional): A list of tuples of input arguments that should be used to check the trace against what is expected. Each tuple is equivalent to a seet of input arguments that would - be specified in `args`. For best results, pass in a + be specified in ``args``. For best results, pass in a set of checking inputs representative of the space of shapes and types of inputs you expect the network to see. - If not specified, the original `args` is used for checking + If not specified, the original ``args`` is used for checking check_tolerance (float, optional): Floating-point comparison tolerance to use in the checker procedure. This can be used to relax the checker strictness in the event that results diverge numerically for a known reason, such as operator fusion. Returns: - A torch.jit.ScriptModule object with a single forward() method containing the traced code. - When func in s a torch.nn.Module, the returned ScriptModule will have the same set of + A ``ScriptModule`` object with a single ``forward()`` method containing the traced code. + When func is a ``torch.nn.Module``, the returned ``ScriptModule`` will have the same set of sub-modules and parameters as func. Example: @@ -790,6 +802,7 @@ def __init__(cls, name, bases, attrs): super_constants = getattr(super(cls), '_constants_set', set()) cls._constants_set = set(getattr(cls, '__constants__', ())).union(super_constants) + @functools.wraps(original_init) def init_then_register(self, *args, **kwargs): # ensure even if the user forgets to call super that # the pybind object is initialized so it will not segfault @@ -807,6 +820,117 @@ def init_then_register(self, *args, **kwargs): if _enabled: class ScriptModule(with_metaclass(ScriptMeta, torch._C.ScriptModule, Module)): + r""" + The core data structure in Torch Script is the ``ScriptModule``. It is an + analogue of torch's nn.Module and represents an entire model as a tree of + submodules. 
Like normal modules, each individual module in a ScriptModule can + have submodules, parameters, and methods. In nn.Modules methods are implemented + as Python functions, but in ScriptModules methods typically implemented as + *Torch Script* functions, a statically-typed subset of Python that contains all + of PyTorch's built-in Tensor operations. This difference allows your + ScriptModules code to run without the need for a Python interpreter. + + ScriptModules and the Torch Script functions inside of them can be created in + two ways: + + **Tracing:** + + Using ``torch.jit.trace``, you can take an existing module or python + function, provide example inputs, and we run the function, recording the + operations performed on all the tensors. We turn the resulting recording + into a Torch Script method that is installed as the ``forward`` method of a + ScriptModule. This module also contains any parameters that the original + module had as well. + + Example:: + + import torch + def foo(x, y): + return 2*x + y + traced_foo = torch.jit.trace(foo, (torch.rand(3), torch.rand(3))) + + .. note:: + Tracing a *function* will produce a ScriptModule with a single + ``forward`` method that implements that function, and that contains + no parameteres. + + Example:: + + import torch + import torchvision + traced_net = torch.jit.trace(torchvision.models.resnet18(), + torch.rand(1, 3, 224, 224)) + + .. note:: + + Since tracing only records operations on tensors, it will not record any + control-flow operations like if statements or loops. When this control-flow is + constant across your module, this is fine and it often just inlines + configuration decisions. But sometimes the control-flow is actually part of the + model itself. For instance, a beam search in sequence-to-sequence translation is + a loop over the (varying) sequence length of inputs. In this case tracing would + not be appropriate and the beam search should be written using scripting. + + **Scripting:** + + You can write Torch Script code directly using Python syntax. You do this + using the ``torch.jit.script`` annotation (for functions) or + ``torch.jit.script_method`` annotation (for methods) on subclasses of + ScriptModule. With this annotation the body of the annotated function is + directly translated into Torch Script. Torch Script itself is a subset of + the Python language, so not all features in python work, but we provide + enough functionality to compute on tensors and do control-dependent + operations. + + Example:: + + import torch + @torch.jit.script + def foo(x, y): + if x.max() > y.max(): + r = x + else: + r = y + return r + + .. note:: + A script *function* annotation will construct a ScriptModule + with a single ``forward`` method that implements that function, + and that contains no parameters. 
+ + Example:: + + import torch + class MyModule(torch.jit.ScriptModule): + def __init__(self, N, M): + super(MyModule, self).__init__() + self.weight = torch.nn.Parameter(torch.rand(N, M)) + + @torch.jit.script_method + def forward(self, input): + return self.weight.mv(input) + + Example:: + + import torch + import torch.nn as nn + import torch.nn.functional as F + from torch.jit import ScriptModule, script_method, trace + + class MyScriptModule(ScriptModule): + def __init__(self): + super(MyScriptModule, self).__init__() + # trace produces a ScriptModule's conv1 and conv2 + self.conv1 = trace(nn.Conv2d(1, 20, 5), torch.rand(1, 1, 16, 16)) + self.conv2 = trace(nn.Conv2d(20, 20, 5), torch.rand(1, 20, 16, 16)) + + @script_method + def forward(self, input): + input = F.relu(self.conv1(input)) + input = F.relu(self.conv2(input)) + return input + """ + def __init__(self, optimize=True): # must be before Module.init since the field is used in __getattr__ Module.__init__(self) @@ -999,6 +1123,8 @@ def forward(self, input): _builtin_table = None +_modules_containing_builtins = (torch, torch.nn.functional) + # lazily built to ensure the correct initialization order def _get_builtin_table(): @@ -1012,8 +1138,9 @@ def register_all(mod): v = getattr(mod, name) if callable(v): _builtin_table[id(v)] = "aten::" + name - register_all(torch) - register_all(torch.nn.functional) + for mod in _modules_containing_builtins: + register_all(mod) + return _builtin_table diff --git a/torch/jit/supported_ops.py b/torch/jit/supported_ops.py new file mode 100644 index 0000000000000..fbb5bf54975e3 --- /dev/null +++ b/torch/jit/supported_ops.py @@ -0,0 +1,86 @@ +import torch.jit +# this file is for generating documentation using sphinx autodoc +# > help(torch.jit.supported_ops) will also give a nice listed of the +# supported ops programmatically + + +def _list_supported_ops(): + def emit_type(type): + return str(type) + + def emit_arg(indent, i, arg): + v = "{} : {}".format(arg.name, emit_type(arg.type)) + default = arg.default_value + if default is not None: + v = "{}={}".format(v, str(default)) + if i > 0: + v = "\n{}{}".format(" " * indent, v) + return v + + def emit_args(indent, arguments): + return ",".join(emit_arg(indent, i, arg) for i, arg in enumerate(arguments)) + + def emit_ret(ret): + return emit_type(ret.type) + + def emit_rets(returns): + if len(returns) == 1: + return emit_ret(returns[0]) + return "Tuple[{}]".format(", ".join(emit_ret(r) for r in returns)) + + def emit_schema(mod, name, schema, arg_start=0): + qualified_name = "{}.{}".format(mod, name) + schema = "{}({}) -> {}".format(qualified_name, + emit_args(len(qualified_name) + 1 + 4, schema.arguments[arg_start:]), + emit_rets(schema.returns)) + return schema + + def hidden(name): + return name.startswith('_') and not name.startswith('__') + + functions = [] + + for mod in torch.jit._modules_containing_builtins: + name = mod.__name__ + for elem in dir(mod): + builtin = torch.jit._find_builtin(getattr(mod, elem)) + if builtin is not None: + schemas = torch._C._jit_get_schemas_for_operator(builtin) + for schema in schemas: + # remove _tan but not __and__ + if not hidden(elem): + functions.append(emit_schema(name, elem, schema)) + + def is_tensor_method(schema): + if len(schema.arguments) == 0: + return False + self = schema.arguments[0] + if self.name != 'self': + return False + if not self.type.isSubtypeOf(torch._C.DynamicType.get()): + return False + return True + + methods = [] + # discover methods + for elem in dir(torch.Tensor): + if not 
hidden(elem): + schemas = torch._C._jit_get_schemas_for_operator("aten::" + elem) + for schema in schemas: + if is_tensor_method(schema): + methods.append(emit_schema('Tensor', elem, schema, arg_start=1)) + + def emit_block(decls): + return '\n::\n\n{}\n'.format(''.join(' {}\n\n'.format(d) for d in decls)) + body = """ +Supported Functions +~~~~~~~~~~~~~~~~~~~ +{} + +Supported Methods +~~~~~~~~~~~~~~~~~ +{} +""" + return body.format(emit_block(functions), emit_block(methods)) + +__doc__ = _list_supported_ops() diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index a2c14377bc38f..9a09065bc4e38 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -2,7 +2,6 @@ cmake_minimum_required(VERSION 3.2 FATAL_ERROR) # Find modules. list(APPEND CMAKE_MODULE_PATH - /usr/lib/x86_64-linux-gnu/ ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/public ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules_CUDA_fix) diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index c964c54e3d8bf..b7bf001be9f89 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -200,6 +200,70 @@ void ProcessGroupGloo::WorkGloo::finishWithException( cv_.notify_all(); } +ProcessGroupGloo::SendWork::SendWork( + at::Tensor& tensor, + std::unique_ptr<::gloo::transport::UnboundBuffer> buffer) + : tensor_(tensor), + buffer_(std::move(buffer)) { +} + +bool ProcessGroupGloo::SendWork::isCompleted() const { + // No way to poll for completion yet + return true; +} + +bool ProcessGroupGloo::SendWork::isSuccess() const { + // No way to fail yet + return true; +} + +void ProcessGroupGloo::SendWork::synchronize() { + // CPU only, no need to synchronize + return; +} + +bool ProcessGroupGloo::SendWork::wait() { + buffer_->waitSend(); + return true; +} + +const std::exception& ProcessGroupGloo::SendWork::exception() const { + throw std::runtime_error("no exception"); +} + +ProcessGroupGloo::RecvWork::RecvWork( + at::Tensor& tensor, + std::unique_ptr<::gloo::transport::UnboundBuffer> buffer, + int* srcRank) + : tensor_(tensor), + buffer_(std::move(buffer)), + srcRank_(srcRank) { +} + +bool ProcessGroupGloo::RecvWork::isCompleted() const { + // No way to poll for completion yet + return true; +} + +bool ProcessGroupGloo::RecvWork::isSuccess() const { + // No way to fail yet + return true; +} + +void ProcessGroupGloo::RecvWork::synchronize() { + // CPU only, no need to synchronize + return; +} + +bool ProcessGroupGloo::RecvWork::wait() { + buffer_->waitRecv(srcRank_); + return true; +} + +const std::exception& ProcessGroupGloo::RecvWork::exception() const { + throw std::runtime_error("no exception"); +} + ProcessGroupGloo::Options::Options() : timeout(std::chrono::milliseconds(10 * 1000)), threads(2), @@ -595,22 +659,79 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupGloo::scatter( throw std::runtime_error("ProcessGroupGloo does not support scatter"); } +at::Tensor& checkSingleTensor(std::vector<at::Tensor>& tensors) { + if (tensors.size() != 1) { + throw std::runtime_error("ProcessGroupGloo::send takes a single tensor"); + } + auto& tensor = tensors[0]; + if (!tensor.is_contiguous()) { + throw std::runtime_error("input tensor has to be contiguous"); + } + if (tensor.is_sparse()) { + throw std::runtime_error("input tensor has to be dense"); + } + return tensor; +} + std::shared_ptr<ProcessGroup::Work> ProcessGroupGloo::send( - std::vector<at::Tensor>& /* unused */, - int /* unused */) { - throw std::runtime_error("ProcessGroupGloo does not support send"); + std::vector<at::Tensor>& tensors, + int dstRank) { + auto& tensor = checkSingleTensor(tensors); + auto ptr = tensor.data_ptr(); + auto size = tensor.numel() * tensor.type().elementSizeInBytes(); + + // Construct unbound buffer. + auto& context = contexts_[0]; + auto buf = context->createUnboundBuffer(ptr, size); + buf->send(dstRank, 0); + + // The work captures the tensor to prevent it being deallocated and + // the unbound buffer to synchronize on completion of the send. + return std::make_shared<SendWork>(tensor, std::move(buf)); } std::shared_ptr<ProcessGroup::Work> ProcessGroupGloo::recv( - std::vector<at::Tensor>& /* unused */, - int /* unused */) { - throw std::runtime_error("ProcessGroupGloo does not support recv"); + std::vector<at::Tensor>& tensors, + int srcRank) { + auto& tensor = checkSingleTensor(tensors); + auto ptr = tensor.data_ptr(); + auto size = tensor.numel() * tensor.type().elementSizeInBytes(); + + // Construct unbound buffer. + auto& context = contexts_[0]; + auto buf = context->createUnboundBuffer(ptr, size); + buf->recv(srcRank, 0); + + // The work captures the tensor to prevent it being deallocated and + // the unbound buffer to synchronize on completion of the recv. + return std::make_shared<RecvWork>(tensor, std::move(buf), nullptr); } std::shared_ptr<ProcessGroup::Work> ProcessGroupGloo::recvAnysource( - std::vector<at::Tensor>& /* unused */, - int* /* unused */) { - throw std::runtime_error("ProcessGroupGloo does not support recv"); + std::vector<at::Tensor>& tensors, + int* srcRank) { + auto& tensor = checkSingleTensor(tensors); + auto ptr = tensor.data_ptr(); + auto size = tensor.numel() * tensor.type().elementSizeInBytes(); + + // Construct unbound buffer. + auto& context = contexts_[0]; + auto buf = context->createUnboundBuffer(ptr, size); + + // Build list of ranks that this operation can recv from. In these + // bindings we don't differentiate between ranks and can receive + // from any other process in the group. + std::vector<int> srcRanks; + srcRanks.reserve(size_); + for (auto i = 0; i < size_; i++) { + srcRanks.push_back(i); + } + + buf->recv(srcRanks, 0); + + // The work captures the tensor to prevent it being deallocated and + // the unbound buffer to synchronize on completion of the recv. + return std::make_shared<RecvWork>(tensor, std::move(buf), srcRank); } std::shared_ptr<ProcessGroup::Work> ProcessGroupGloo::barrier() { diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index c5682433daffd..9d5be5d200d3d 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -212,6 +212,60 @@ class ProcessGroupGloo : public ProcessGroup { friend class ProcessGroupGloo; }; + // For send and recv operations there is no need to pass them to the + // thread pool as they are entirely completed by the device thread. + // This work object is used to synchronize completion of the send or + // recv operation. It keeps a reference to the tensor it is + // operating on to prevent it from being deallocated while the + // operation is still in flight.
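
The unbound-buffer send/recv added above, together with the SendWork/RecvWork declarations that follow, give ProcessGroupGloo blocking point-to-point semantics: wait() on the returned work object calls waitSend()/waitRecv() on the buffer. The Python entry points for these c10d bindings are not part of this hunk, so the sketch below is only an illustration of the same blocking send/recv contract through the existing torch.distributed API with the Gloo backend; the address, world size, and tensor shape are placeholders::

    # Launch one copy of this script per rank, e.g. with RANK=0 and RANK=1.
    import os
    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="gloo",
                            init_method="tcp://127.0.0.1:23456",  # placeholder address
                            rank=int(os.environ["RANK"]),
                            world_size=2)

    t = torch.zeros(4)
    if dist.get_rank() == 0:
        t += 1.0
        dist.send(t, dst=1)   # returns once the tensor has been handed off to rank 1
    else:
        src = dist.recv(t)    # no src given: receive from any rank; returns the sender's rank
        print("got", t, "from rank", src)
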
+ class SendWork : public ProcessGroup::Work { + public: + explicit SendWork( + at::Tensor& tensor, + std::unique_ptr<::gloo::transport::UnboundBuffer> buffer); + + virtual ~SendWork() = default; + + bool isCompleted() const override; + + bool isSuccess() const override; + + void synchronize() override; + + bool wait() override; + + const std::exception& exception() const override; + + protected: + at::Tensor tensor_; + std::unique_ptr<::gloo::transport::UnboundBuffer> buffer_; + }; + + class RecvWork : public ProcessGroup::Work { + public: + explicit RecvWork( + at::Tensor& tensor, + std::unique_ptr<::gloo::transport::UnboundBuffer> buffer, + int* srcRank); + + virtual ~RecvWork() = default; + + bool isCompleted() const override; + + bool isSuccess() const override; + + void synchronize() override; + + bool wait() override; + + const std::exception& exception() const override; + + protected: + at::Tensor tensor_; + std::unique_ptr<::gloo::transport::UnboundBuffer> buffer_; + int* srcRank_; + }; + struct Options { explicit Options(); diff --git a/torch/lib/c10d/Store.hpp b/torch/lib/c10d/Store.hpp index d24fac275dcfe..155c9ac78b2d3 100644 --- a/torch/lib/c10d/Store.hpp +++ b/torch/lib/c10d/Store.hpp @@ -11,7 +11,7 @@ namespace c10d { class Store { public: static constexpr std::chrono::milliseconds kDefaultTimeout = - std::chrono::seconds(30); + std::chrono::seconds(300); static constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds::zero(); diff --git a/torch/nn/init.py b/torch/nn/init.py index a5c8239d3dc96..29de7b54a1490 100644 --- a/torch/nn/init.py +++ b/torch/nn/init.py @@ -178,7 +178,7 @@ def dirac_(tensor): def _calculate_fan_in_and_fan_out(tensor): dimensions = tensor.ndimension() if dimensions < 2: - raise ValueError("Fan in and fan out can not be computed for tensor with less than 2 dimensions") + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if dimensions == 2: # Linear fan_in = tensor.size(1) diff --git a/torch/nn/modules/adaptive.py b/torch/nn/modules/adaptive.py index 88ae064c6bd93..471e1ce32247c 100644 --- a/torch/nn/modules/adaptive.py +++ b/torch/nn/modules/adaptive.py @@ -25,7 +25,7 @@ class AdaptiveLogSoftmaxWithLoss(Module): Adaptive softmax partitions the labels into several clusters, according to their frequency. These clusters may contain different number of targets each. - Additionally, clusters containig less frequent labels assign lower + Additionally, clusters containing less frequent labels assign lower dimensional embeddings to those labels, which speeds up the computation. For each minibatch, only clusters for which at least one target is present are evaluated. 
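
Since the AdaptiveLogSoftmaxWithLoss docstring is touched just above, a minimal usage sketch may make the cluster idea concrete. The in_features, n_classes, and cutoffs values below are arbitrary, and the call follows the module's documented (input, target) -> (output, loss) interface::

    import torch
    import torch.nn as nn

    # 2000 labels split into a frequent head (< 10) and three tail clusters;
    # rarer clusters get smaller projections, controlled by div_value.
    asm = nn.AdaptiveLogSoftmaxWithLoss(in_features=64, n_classes=2000,
                                        cutoffs=[10, 100, 1000], div_value=4.0)

    hidden = torch.randn(128, 64)                              # batch of hidden states
    target = torch.randint(0, 2000, (128,), dtype=torch.long)  # label indices

    out, loss = asm(hidden, target)  # out: log-prob of each sample's target, loss: mean NLL
    loss.backward()
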
diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index daa03f9f58511..78a3bbfc2c79f 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -164,7 +164,9 @@ def __init__(self, module, device_ids=None, else: self._module_copies = [self.module] + # .data() of each parameter for each model replica self.modules_params_data = [[] for _ in range(len(self.device_ids))] + # .data() of each buffer for each model replica self.modules_buffers_data = [[] for _ in range(len(self.device_ids))] for dev_idx, module in enumerate(self._module_copies): @@ -252,29 +254,12 @@ def _dist_broadcast_coalesced(self, tensors, buffer_size): c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) def _sync_params(self): - if len(self.device_ids) > 1: - # intra-node parameter sync - result = broadcast_coalesced(self.modules_params_data[0], - self.device_ids, - self.broadcast_bucket_size) - for tensors, module_params_data in zip(result[1:], self.modules_params_data[1:]): - for tensor, param_data in zip(tensors, module_params_data): - param_data.set_(tensor) - - # module buffer sync - if self.broadcast_buffers: - if len(self.modules_buffers_data[0]) > 0: - # cross-node buffer sync - self._dist_broadcast_coalesced(self.modules_buffers_data[0], - self.broadcast_bucket_size) - if len(self.device_ids) > 1: - # intra-node buffer sync - result = broadcast_coalesced(self.modules_buffers_data[0], - self.device_ids, - self.broadcast_bucket_size) - for tensors, module_buffers_data in zip(result[1:], self.modules_buffers_data[1:]): - for tensor, buffer_data in zip(tensors, module_buffers_data): - buffer_data.set_(tensor) + c10d._sync_params(self.process_group, + self.modules_params_data, + self.modules_buffers_data, + self.device_ids, + self.broadcast_bucket_size, + self.broadcast_buffers) def _register_grad_hooks(self): self._grad_accs = [] # need to keep them in scope diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index c1a4fb7267255..23e95e282c440 100644 --- a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -1097,11 +1097,11 @@ def retrieve_state(x, start, end): # The ONNX RNN/GRU/LSTM produce an output of dimensions # seq_len, num_directions, batch, hidden_size # We have to convert to match pytorch's expected - # seq_len, batch, hidden_size * num_directions - # by first moving num_directions to the end with + # seq_len, batch, num_directions * hidden_size + # by first moving num_directions before hidden_size with # Transpose, and then combining it with hidden_size # with Reshape. 
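
The perm fix in the lines that follow is easiest to verify with concrete sizes: moving num_directions in front of hidden_size and then flattening the trailing two dimensions reproduces PyTorch's (seq_len, batch, num_directions * hidden_size) layout. A small standalone check with arbitrary sizes::

    import torch

    seq_len, num_directions, batch, hidden_size = 5, 2, 3, 4
    onnx_out = torch.randn(seq_len, num_directions, batch, hidden_size)  # ONNX RNN output layout

    # Transpose with perm [0, 2, 1, 3], then flatten the last two dims.
    torch_out = onnx_out.permute(0, 2, 1, 3).reshape(seq_len, batch, -1)

    assert torch_out.shape == (seq_len, batch, num_directions * hidden_size)
    # Direction d occupies columns [d * hidden_size, (d + 1) * hidden_size).
    assert torch.equal(torch_out[..., hidden_size:], onnx_out[:, 1])
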
- prev_output = g.op('Transpose', prev_output, perm_i=[0, 2, 3, 1]) + prev_output = g.op('Transpose', prev_output, perm_i=[0, 2, 1, 3]) prev_output = g.op('Reshape', prev_output, g.op('Constant', value_t=torch.LongTensor([0, 0, -1]))) else: prev_output = g.op('Squeeze', prev_output, axes_i=[1]) diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py index b39312df028a1..b2fff3ed089d4 100644 --- a/torch/optim/adagrad.py +++ b/torch/optim/adagrad.py @@ -85,7 +85,7 @@ def make_sparse(values): return constructor().resize_as_(grad) return constructor(grad_indices, values, size) state['sum'].add_(make_sparse(grad_values.pow(2))) - std = state['sum']._sparse_mask(grad) + std = state['sum'].sparse_mask(grad) std_values = std._values().sqrt_().add_(1e-10) p.data.add_(-clr, make_sparse(grad_values / std_values)) else: diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index 74d15a937861e..afb23bfa462a2 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -82,14 +82,14 @@ def make_sparse(values): # Decay the first and second moment running average coefficient # old <- b * old + (1 - b) * new # <==> old += (1 - b) * (new - old) - old_exp_avg_values = exp_avg._sparse_mask(grad)._values() + old_exp_avg_values = exp_avg.sparse_mask(grad)._values() exp_avg_update_values = grad_values.sub(old_exp_avg_values).mul_(1 - beta1) exp_avg.add_(make_sparse(exp_avg_update_values)) - old_exp_avg_sq_values = exp_avg_sq._sparse_mask(grad)._values() + old_exp_avg_sq_values = exp_avg_sq.sparse_mask(grad)._values() exp_avg_sq_update_values = grad_values.pow(2).sub_(old_exp_avg_sq_values).mul_(1 - beta2) exp_avg_sq.add_(make_sparse(exp_avg_sq_update_values)) - # Dense addition again is intended, avoiding another _sparse_mask + # Dense addition again is intended, avoiding another sparse_mask numer = exp_avg_update_values.add_(old_exp_avg_values) exp_avg_sq_update_values.add_(old_exp_avg_sq_values) denom = exp_avg_sq_update_values.sqrt_().add_(group['eps']) diff --git a/torch/tensor.py b/torch/tensor.py index 2c655d74af8c3..f72db6d138e58 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -325,13 +325,15 @@ def unique(self, sorted=False, return_inverse=False, dim=None): See :func:`torch.unique` """ if dim is not None: - output, inverse_indices = self._unique_dim( + output, inverse_indices = torch._unique_dim( + self, sorted=sorted, return_inverse=return_inverse, dim=dim ) else: - output, inverse_indices = self._unique( + output, inverse_indices = torch._unique( + self, sorted=sorted, return_inverse=return_inverse ) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index ff30ab0a8bc30..0af802625087c 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -350,7 +350,7 @@ def CUDAExtension(name, sources, *args, **kwargs): Example: >>> from setuptools import setup - >>> from torch.utils.cpp_extension import BuildExtension, CppExtension + >>> from torch.utils.cpp_extension import BuildExtension, CUDAExtension >>> setup( name='cuda_extension', ext_modules=[ diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 60789e9fb6299..951321fcdf105 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -256,7 +256,6 @@ def __init__(self, loader): if self.num_workers > 0: self.worker_init_fn = loader.worker_init_fn - self.index_queues = [multiprocessing.Queue() for _ in range(self.num_workers)] self.worker_queue_idx = 0 self.worker_result_queue = multiprocessing.Queue() 
self.batches_outstanding = 0 @@ -267,14 +266,26 @@ def __init__(self, loader): self.reorder_dict = {} self.done_event = multiprocessing.Event() - self.workers = [ - multiprocessing.Process( + self.index_queues = [] + self.workers = [] + for i in range(self.num_workers): + index_queue = multiprocessing.Queue() + w = multiprocessing.Process( target=_worker_loop, - args=(self.dataset, self.index_queues[i], + args=(self.dataset, index_queue, self.worker_result_queue, self.done_event, self.collate_fn, base_seed + i, self.worker_init_fn, i)) - for i in range(self.num_workers)] + w.daemon = True # ensure that the worker exits on process exit + # Process.start() actually takes some time as it needs to start a + # process and pass the arguments over via a pipe. Therefore, we + # only add a worker to the self.workers list after it has started, so + # that we do not call .join() if the program dies before it starts, + # and __del__ tries to join it but will get: + # AssertionError: can only join a started process. + w.start() + self.index_queues.append(index_queue) + self.workers.append(w) if self.pin_memory: self.data_queue = queue.Queue() @@ -287,10 +298,6 @@ def __init__(self, loader): else: self.data_queue = self.worker_result_queue - for w in self.workers: - w.daemon = True # ensure that the worker exits on process exit - w.start() - _update_worker_pids(id(self), tuple(w.pid for w in self.workers)) _set_SIGCHLD_handler() self.worker_pids_set = True
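
The DataLoader change above only reorders worker start-up (each worker is made a daemon and started before being appended to self.workers); the user-facing API is unchanged. A minimal sketch with a stand-in dataset, showing where num_workers and worker_init_fn enter::

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.randn(100, 3),
                            torch.randint(0, 2, (100,), dtype=torch.long))

    def worker_init_fn(worker_id):
        # Runs once in each worker process, right after it has started.
        print("worker", worker_id, "ready")

    if __name__ == "__main__":  # required on platforms that spawn worker processes
        loader = DataLoader(dataset, batch_size=16, shuffle=True,
                            num_workers=2, worker_init_fn=worker_init_fn)
        for data, target in loader:
            pass  # each batch is collated in one of the two worker processes
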