diff --git a/.clang-tidy b/.clang-tidy index d5fc66c26d42d9..5466a4a31d20a3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -2,7 +2,6 @@ # NOTE: there must be no spaces before the '-', so put the comma first. Checks: ' * - ,clang-analyzer-* ,modernize-* ,-cert-err58-cpp ,-cert-err60-cpp @@ -10,7 +9,6 @@ Checks: ' ,-cppcoreguidelines-owning-memory ,-cppcoreguidelines-pro-bounds-array-to-pointer-decay ,-cppcoreguidelines-pro-bounds-constant-array-index - ,-cppcoreguidelines-pro-type-member-init ,-cppcoreguidelines-pro-type-static-cast-downcast ,-cppcoreguidelines-pro-type-vararg ,-cppcoreguidelines-special-member-functions @@ -25,11 +23,9 @@ Checks: ' ,-hicpp-braces-around-statements ,-hicpp-explicit-conversions ,-hicpp-no-array-decay - ,-hicpp-signed-bitwise ,-hicpp-special-member-functions ,-hicpp-vararg ,-llvm-header-guard - ,-llvm-include-order ,-llvm-namespace-comment ,-misc-unused-parameters ,-modernize-make-unique @@ -38,6 +34,7 @@ Checks: ' ,-readability-braces-around-statements ,-readability-else-after-return ,-readability-named-parameter + ,clang-analyzer-* ' WarningsAsErrors: '' HeaderFilterRegex: 'torch/csrc/' diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index cd41d1a02f8290..00000000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.bat text eol=crlf diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 3bc5157d9cab7a..6b8aa6fc62bb94 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -124,7 +124,7 @@ CMAKE_ARGS+=("-DUSE_OBSERVERS=ON") CMAKE_ARGS+=("-DUSE_ZSTD=ON") CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}") -if [[ $BUILD_ENVIRONMENT == *-aten-* || -n "$INTEGRATED" ]]; then +if [[ $BUILD_ENVIRONMENT == *-aten-* ]]; then if [[ CMAKE_ARGS != *USE_ATEN* ]] && [[ CMAKE_ARGS != *BUILD_ATEN* ]]; then CMAKE_ARGS+=("-DBUILD_ATEN=ON") fi diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 40e3e21417b9b2..053a9be5e05487 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -64,13 +64,7 @@ for test in $(find "${INSTALL_PREFIX}/test" -executable -type f); do ;; */aten/*) # ATen uses test framework Catch2 - # NB: We do NOT use the xml test reporter, because - # Catch doesn't support multiple reporters - # c.f. https://github.com/catchorg/Catch2/blob/master/docs/release-notes.md#223 - # which means that enabling XML output means you lose useful stdout - # output for Jenkins. It's more important to have useful console - # output than it is to have XML output for Jenkins. 
- "$test" + "$test" -r=xml -o "${junit_reports_dir}/$(basename $test).xml" ;; *) "$test" --gtest_output=xml:"$gtest_reports_dir/$(basename $test).xml" @@ -115,10 +109,6 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then # Our cuda top_k op has some asm code, the hipified version doesn't # compile yet, so we don't have top_k operator for now rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") - - # Our AMD CI boxes have 4 gpus on each - # Remove this once we have added multi-gpu support - export HIP_VISIBLE_DEVICES=$(($BUILD_NUMBER % 4)) fi # Python tests diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 48e81dfd635bce..56db6914c1c20a 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -43,9 +43,12 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking export KMTHINLTO=1 - python tools/amd_build/build_pytorch_amd.py - USE_ROCM=1 python setup.py install --user - exit 0 + sudo chown -R jenkins:jenkins /usr/local + rm -rf "$(dirname "${BASH_SOURCE[0]}")/../../../pytorch_amd/" || true + python "$(dirname "${BASH_SOURCE[0]}")/../../tools/amd_build/build_pytorch_amd.py" + + USE_ROCM=1 python setup.py install + exit fi # TODO: Don't install this here diff --git a/CMakeLists.txt b/CMakeLists.txt index c7eb20d1336550..651e230ab35ea7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,10 +214,9 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-overflow") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-strict-aliasing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-overflow") # These flags are not available in GCC-4.8.5. Set only when using clang. # Compared against https://gcc.gnu.org/onlinedocs/gcc-4.8.5/gcc/Option-Summary.html - if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-typedef-redefinition") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") @@ -227,7 +226,6 @@ if(NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++14-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constexpr-not-const") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") endif() if ((APPLE AND (NOT ("${CLANG_VERSION_STRING}" VERSION_LESS "9.0"))) OR (CMAKE_COMPILER_IS_GNUCXX @@ -286,8 +284,6 @@ include_directories(BEFORE ${PROJECT_SOURCE_DIR}) # in PROJECT_SOURCE_DIR. 
include_directories(BEFORE ${PROJECT_BINARY_DIR}) -include_directories(BEFORE ${PROJECT_SOURCE_DIR}/aten/src/) - # ---[ Old caffe protobuf if(BUILD_CAFFE2) add_subdirectory(caffe/proto) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 2f2ffdce186d39..462a12b086d2d0 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -146,5 +146,4 @@ if (CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) - set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) endif() diff --git a/aten/src/ATen/core/AlignOf.h b/aten/src/ATen/AlignOf.h similarity index 68% rename from aten/src/ATen/core/AlignOf.h rename to aten/src/ATen/AlignOf.h index a7e42196f43ecd..5e9f0127b32e70 100644 --- a/aten/src/ATen/core/AlignOf.h +++ b/aten/src/ATen/AlignOf.h @@ -33,7 +33,7 @@ namespace at { // MSVC requires special handling here. #ifndef _MSC_VER -template +template struct AlignedCharArray { alignas(Alignment) char buffer[Size]; }; @@ -41,7 +41,7 @@ struct AlignedCharArray { #else // _MSC_VER /// \brief Create a type with an aligned char buffer. -template +template struct AlignedCharArray; // We provide special variations of this template for the most common @@ -52,7 +52,7 @@ struct AlignedCharArray; // MSVC warns on the existence of the declspec despite the union member forcing // proper alignment. -template +template struct AlignedCharArray<1, Size> { union { char aligned; @@ -60,7 +60,7 @@ struct AlignedCharArray<1, Size> { }; }; -template +template struct AlignedCharArray<2, Size> { union { short aligned; @@ -68,7 +68,7 @@ struct AlignedCharArray<2, Size> { }; }; -template +template struct AlignedCharArray<4, Size> { union { int aligned; @@ -76,7 +76,7 @@ struct AlignedCharArray<4, Size> { }; }; -template +template struct AlignedCharArray<8, Size> { union { double aligned; @@ -84,13 +84,14 @@ struct AlignedCharArray<8, Size> { }; }; + // The rest of these are provided with a __declspec(align(...)) and we simply // can't pass them by-value as function arguments on MSVC. 
#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ - template \ - struct AlignedCharArray { \ - __declspec(align(x)) char buffer[Size]; \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ }; AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) @@ -103,47 +104,24 @@ AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) #endif // _MSC_VER namespace detail { -template < - typename T1, - typename T2 = char, - typename T3 = char, - typename T4 = char, - typename T5 = char, - typename T6 = char, - typename T7 = char, - typename T8 = char, - typename T9 = char, - typename T10 = char> +template class AlignerImpl { - T1 t1; - T2 t2; - T3 t3; - T4 t4; - T5 t5; - T6 t6; - T7 t7; - T8 t8; - T9 t9; - T10 t10; + T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; T8 t8; T9 t9; T10 t10; AlignerImpl() = delete; }; -template < - typename T1, - typename T2 = char, - typename T3 = char, - typename T4 = char, - typename T5 = char, - typename T6 = char, - typename T7 = char, - typename T8 = char, - typename T9 = char, - typename T10 = char> +template union SizerImpl { char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], - arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], - arr9[sizeof(T9)], arr10[sizeof(T10)]; + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; }; } // end namespace detail @@ -154,20 +132,14 @@ union SizerImpl { /// expose a char array buffer member which can be used as suitable storage for /// a placement new of any of these types. Support for more than ten types can /// be added at the cost of more boilerplate. -template < - typename T1, - typename T2 = char, - typename T3 = char, - typename T4 = char, - typename T5 = char, - typename T6 = char, - typename T7 = char, - typename T8 = char, - typename T9 = char, - typename T10 = char> -struct AlignedCharArrayUnion - : AlignedCharArray< - alignof(detail::AlignerImpl), - sizeof(::at::detail:: - SizerImpl)> {}; +template +struct AlignedCharArrayUnion : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::at::detail::SizerImpl)> { +}; } // end namespace at diff --git a/aten/src/ATen/Allocator.h b/aten/src/ATen/Allocator.h index 26989a7ea7fbed..c1c78102a0fef8 100644 --- a/aten/src/ATen/Allocator.h +++ b/aten/src/ATen/Allocator.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace at { diff --git a/aten/src/ATen/ArrayRef.cpp b/aten/src/ATen/ArrayRef.cpp deleted file mode 100644 index 2a5d1f7a7cb595..00000000000000 --- a/aten/src/ATen/ArrayRef.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/aten/src/ATen/ArrayRef.h b/aten/src/ATen/ArrayRef.h index f52a5fcf1c2c58..df144025578c6b 100644 --- a/aten/src/ATen/ArrayRef.h +++ b/aten/src/ATen/ArrayRef.h @@ -1,2 +1,192 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. +// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... 
+ #pragma once -#include + +#include +#include + +#include +#include +#include + +namespace at { + /// ArrayRef - Represent a constant reference to an array (0 or more elements + /// consecutively in memory), i.e. a start pointer and a length. It allows + /// various APIs to take consecutive elements easily and conveniently. + /// + /// This class does not own the underlying data, it is expected to be used in + /// situations where the data resides in some other buffer, whose lifetime + /// extends past that of the ArrayRef. For this reason, it is not in general + /// safe to store an ArrayRef. + /// + /// This is intended to be trivially copyable, so it should be passed by + /// value. + template + class ArrayRef { + public: + typedef const T *iterator; + typedef const T *const_iterator; + typedef size_t size_type; + + typedef std::reverse_iterator reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T *Data; + + /// The number of elements. + size_type Length; + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /*implicit*/ ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + /*implicit*/ ArrayRef(const T &OneElt) + : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + /*implicit*/ ArrayRef(const T *data, size_t length) + : Data(data), Length(length) {} + + /// Construct an ArrayRef from a range. + ArrayRef(const T *begin, const T *end) + : Data(begin), Length(end - begin) {} + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. + template + /*implicit*/ ArrayRef(const SmallVectorTemplateCommon &Vec) + : Data(Vec.data()), Length(Vec.size()) { + } + + /// Construct an ArrayRef from a std::vector. + template + /*implicit*/ ArrayRef(const std::vector &Vec) + : Data(Vec.data()), Length(Vec.size()) {} + + /// Construct an ArrayRef from a std::array + template + /*implicit*/ constexpr ArrayRef(const std::array &Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + /*implicit*/ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /*implicit*/ ArrayRef(const std::initializer_list &Vec) + : Data(Vec.begin() == Vec.end() ? (T*)nullptr : Vec.begin()), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + const_iterator begin() const { return Data; } + const_iterator end() const { return Data + Length; } + + reverse_iterator rbegin() const { return reverse_iterator(end()); } + reverse_iterator rend() const { return reverse_iterator(begin()); } + + /// empty - Check if the array is empty. + bool empty() const { return Length == 0; } + + const T *data() const { return Data; } + + /// size - Get the array size. + size_t size() const { return Length; } + + /// front - Get the first element. + const T &front() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. + const T &back() const { + AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length-1]; + } + + /// equals - Check for element-wise equality. 
+ bool equals(ArrayRef RHS) const { + if (Length != RHS.Length) + return false; + return std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Chop off the first N elements of the array, and keep M + /// elements in the array. + ArrayRef slice(size_t N, size_t M) const { + AT_CHECK(N+M <= size(), "ArrayRef: invalid slice, ", N, " + ", M, " is not <= ", size()); + return ArrayRef(data()+N, M); + } + + /// slice(n) - Chop off the first N elements of the array. + ArrayRef slice(size_t N) const { return slice(N, size() - N); } + + /// @} + /// @name Operator Overloads + /// @{ + const T &operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + const T &at(size_t Index) const { + AT_CHECK(Index < Length, "ArrayRef: invalid index ", Index, " for length ", Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(U &&Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + typename std::enable_if::value, ArrayRef>::type & + operator=(std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data+Length); + } + + /// @} + /// @name Conversion operators + /// @{ + operator std::vector() const { + return std::vector(Data, Data+Length); + } + + /// @} + }; + +} // end namespace at diff --git a/aten/src/ATen/core/Backtrace.cpp b/aten/src/ATen/Backtrace.cpp similarity index 92% rename from aten/src/ATen/core/Backtrace.cpp rename to aten/src/ATen/Backtrace.cpp index 7914489d50ece3..a8e062051ee633 100644 --- a/aten/src/ATen/core/Backtrace.cpp +++ b/aten/src/ATen/Backtrace.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -7,30 +7,18 @@ #include #include -#if defined(__ANDROID__) -#define AT_CORE_MOBILE 1 -#elif ( \ - defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) -#define AT_CORE_MOBILE 1 -#else -#define AT_CORE_MOBILE 0 -#endif - -#if !AT_CORE_MOBILE && !defined(_WIN32) && !defined(__EMSCRIPTEN__) -#define SUPPORTS_BACKTRACE 1 -#else -#define SUPPORTS_BACKTRACE 0 -#endif - -#if SUPPORTS_BACKTRACE +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) #include #include #endif // !defined(_WIN32) namespace at { - -#if SUPPORTS_BACKTRACE +#if defined(_MSC_VER) +// Windows does not have cxxabi.h, so we will simply return the original. +std::string demangle(const char* name) { + return std::string(name); +} +#elif !defined(__EMSCRIPTEN__) std::string demangle(const char* name) { int status = -1; @@ -57,10 +45,6 @@ std::string demangle(const char* name) { return name; } } -#else -std::string demangle(const char* name) { - return std::string(name); -} #endif // TODO: This backtrace retrieval can be implemented on Windows via the Windows @@ -68,7 +52,8 @@ std::string demangle(const char* name) { // https://stackoverflow.com/questions/5693192/win32-backtrace-from-c-code // https://stackoverflow.com/questions/26398064/counterpart-to-glibcs-backtrace-and-backtrace-symbols-on-windows // https://msdn.microsoft.com/en-us/library/windows/desktop/bb204633%28v=vs.85%29.aspx. 
-#if SUPPORTS_BACKTRACE +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) + namespace { struct FrameInformation { @@ -158,13 +143,14 @@ at::optional parse_frame_information( } } // anonymous namespace -#endif // SUPPORTS_BACKTRACE + +#endif // !defined(_WIN32) std::string get_backtrace( size_t frames_to_skip, size_t maximum_number_of_frames, bool skip_python_frames) { -#if SUPPORTS_BACKTRACE +#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) // We always skip this frame (backtrace). frames_to_skip += 1; @@ -235,9 +221,10 @@ std::string get_backtrace( } return stream.str(); -#else // !SUPPORTS_BACKTRACE + +#else + return "(no backtrace available)"; -#endif // SUPPORTS_BACKTRACE +#endif } - } // namespace at diff --git a/aten/src/ATen/Backtrace.h b/aten/src/ATen/Backtrace.h index bdef9f4a9de439..347c430d61b75c 100644 --- a/aten/src/ATen/Backtrace.h +++ b/aten/src/ATen/Backtrace.h @@ -1,2 +1,28 @@ #pragma once -#include + +#include +#include +#include + +#include + +namespace at { +/// Utility to demangle a C++ symbol name. +AT_API std::string demangle(const char* name); + +/// Returns the printable name of the type. +template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const std::string name = demangle(typeid(T).name()); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +AT_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 25a2e6d8b501f0..562910ad86a298 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -44,7 +44,6 @@ CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig # NB: If you edit these globs, you'll have to update setup.py package_data as well FILE(GLOB base_h "*.h" "detail/*.h") FILE(GLOB base_cpp "*.cpp" "detail/*.cpp") -add_subdirectory(core) FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") FILE(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") @@ -63,7 +62,7 @@ FILE(GLOB native_cuda_cpp "native/cuda/*.cpp") FILE(GLOB native_mkl_cpp "native/mkl/*.cpp") FILE(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") -set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${native_cpp} ${native_sparse_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -394,7 +393,7 @@ INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake -FOREACH(HEADER ${base_h} ${ATen_CORE_HEADERS} ${cuda_h} ${cudnn_h}) +FOREACH(HEADER ${base_h} ${cuda_h} ${cudnn_h}) string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" HEADER_SUB ${HEADER}) GET_FILENAME_COMPONENT(DIR ${HEADER_SUB} DIRECTORY) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/${DIR}) @@ -445,7 +444,6 @@ if (NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) endif() # Pass source, includes, and libs to parent -set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS 
${ATen_CUDA_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index ef370ea6e0bc30..2db2786b1c66cd 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -109,8 +109,8 @@ struct strided_tensor_iter { : data_(tensor.data()), dim_(tensor.ndimension()), counter_(dim_, 0), - sizes_(tensor.sizes().vec()), - strides_(tensor.strides().vec()) { + sizes_(tensor.sizes()), + strides_(tensor.strides()) { _setup_arrays(tensor, this); } }; diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index d153e6bc6ada00..59f6ff755ee3f1 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -37,11 +37,8 @@ Context::Context() Type::registerCPU(this); } -// NB: Ensure that globalContext is initialized before we load -// variable hooks, otherwise we will deadlock. Regardless, the -// deadlock is bad, and being tracked at https://github.com/pytorch/pytorch/issues/9784 -static Context globalContext_; Context & globalContext() { + static Context globalContext_; return globalContext_; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 7d3fdd1cc2d4af..309c4be2e651dd 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -9,9 +9,6 @@ #include "ATen/detail/CUDAHooksInterface.h" #include "ATen/CUDAStream.h" -// This is temporary -#include "ATen/core/ATenCoreTest.h" - #include #include #include diff --git a/aten/src/ATen/core/Error.cpp b/aten/src/ATen/Error.cpp similarity index 64% rename from aten/src/ATen/core/Error.cpp rename to aten/src/ATen/Error.cpp index 35ba7d644e109b..1261fbe0295d6c 100644 --- a/aten/src/ATen/core/Error.cpp +++ b/aten/src/ATen/Error.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include #include #include @@ -11,13 +11,9 @@ std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { } Error::Error(SourceLocation source_location, std::string err) - : what_without_backtrace_(err), - what_( - str(err, - " (", - source_location, - ")\n", - get_backtrace(/*frames_to_skip=*/2))) {} + : what_without_backtrace_(err) + , what_(str(err, " (", source_location, ")\n", get_backtrace(/*frames_to_skip=*/2))) + {} void Warning::warn(SourceLocation source_location, std::string msg) { warning_handler_(source_location, msg.c_str()); @@ -27,9 +23,7 @@ void Warning::set_warning_handler(handler_t handler) { warning_handler_ = handler; } -void Warning::print_warning( - const SourceLocation& source_location, - const char* msg) { +void Warning::print_warning(const SourceLocation& source_location, const char* msg) { std::cerr << "Warning: " << msg << " (" << source_location << ")\n"; } diff --git a/aten/src/ATen/Error.h b/aten/src/ATen/Error.h index 2a184d4ecbd5ea..5a41eb7c74e7cb 100644 --- a/aten/src/ATen/Error.h +++ b/aten/src/ATen/Error.h @@ -1,2 +1,131 @@ #pragma once -#include + +#include // for AT_API +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace at { + +namespace detail { + +inline std::ostream& _str(std::ostream& ss) { return ss; } + +template +inline std::ostream& _str(std::ostream& ss, const T& t) { + ss << t; + return ss; +} + +template +inline std::ostream& +_str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +} // namespace detail + +// Convert a list of string-like arguments into a single string. 
+template +inline std::string str(const Args&... args) { + std::ostringstream ss; + detail::_str(ss, args...); + return ss.str(); +} + +// Specializations for already-a-string types. +template <> +inline std::string str(const std::string& str) { + return str; +} +inline std::string str(const char* c_str) { + return c_str; +} + +/// Represents a location in source code (for debugging). +struct SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. Should +/// primarily be used with the `AT_ERROR` macro. +/// +/// NB: at::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class AT_API Error : public std::exception { + std::string what_without_backtrace_; + std::string what_; + +public: + Error(SourceLocation source_location, std::string err); + + /// Returns the complete error message, including the source location. + const char* what() const noexcept override { + return what_.c_str(); + } + + /// Returns only the error message string, without source location. + const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } +}; + +class AT_API Warning { + using handler_t = void(*)(const SourceLocation& source_location, const char* msg); + +public: + /// Issue a warning with a given message. Dispatched to the current + /// warning handler. + static void warn(SourceLocation source_location, std::string msg); + + /// Sets the global warning handler. This is not thread-safe, so it should + /// generally be called once during initialization. + static void set_warning_handler(handler_t handler); + + /// The default warning handler. Prints the message to stderr. + static void print_warning(const SourceLocation& source_location, const char* msg); + +private: + static handler_t warning_handler_; +}; + + +} // namespace at + +// TODO: variants that print the expression tested and thus don't require strings +// TODO: CAFFE_ENFORCE_WITH_CALLER style macro + +#define AT_ERROR(...) \ + throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_WARN(...) \ + at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) + +#define AT_ASSERT(cond) \ + if (!(cond)) { \ + AT_ERROR(#cond " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch."); \ + } + +#define AT_ASSERTM(cond, ...) \ + if (!(cond)) { \ + AT_ERROR(at::str(#cond, " ASSERT FAILED at ", __FILE__, ":", __LINE__, ", please report a bug to PyTorch. ", __VA_ARGS__)); \ + } + +#define AT_CHECK(cond, ...) 
\ + if (!(cond)) { \ + AT_ERROR(at::str(__VA_ARGS__)); \ + } diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 934be4093b7257..35125cfa6751bb 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -111,7 +111,7 @@ inline std::vector expand_outplace(TensorList to_expand) { if (!to_expand[i].defined()) { continue; } else if (first) { - sizes = to_expand[i].sizes().vec(); + sizes = to_expand[i].sizes(); first = false; } else { sizes = infer_size(sizes, to_expand[i].sizes()); diff --git a/aten/src/ATen/Half-inl.h b/aten/src/ATen/Half-inl.h new file mode 100644 index 00000000000000..e5563faca3ab33 --- /dev/null +++ b/aten/src/ATen/Half-inl.h @@ -0,0 +1,168 @@ +#pragma once + +#include "ATen/ATenGeneral.h" +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +namespace at { + +/// Constructors + +inline AT_HOSTDEVICE Half::Half(float value) { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x = __half_as_short(__float2half(value)); +#else + x = detail::float2halfbits(value); +#endif +} + +/// Implicit conversions + +inline AT_HOSTDEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#else + return detail::halfbits2float(x); +#endif +} + +#ifdef __CUDACC__ +inline AT_HOSTDEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline AT_HOSTDEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +/// Arithmetic + +inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { + return (float)a + (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { + return (float)a - (float)b; +} + +inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { + return (float)a * (float)b; +} + +inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { + return (float)a / (float)b; +} + +inline AT_HOSTDEVICE Half operator-(const Half& a) { + return -(float)a; +} + +inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline AT_HOSTDEVICE float operator+(Half a, float b) { return (float)a + b; } +inline AT_HOSTDEVICE float operator-(Half a, float b) { return (float)a - b; } +inline AT_HOSTDEVICE float operator*(Half a, float b) { return (float)a * b; } +inline AT_HOSTDEVICE float operator/(Half a, float b) { return (float)a / b; } + +inline AT_HOSTDEVICE float operator+(float a, Half b) { return a + (float)b; } +inline AT_HOSTDEVICE float operator-(float a, Half b) { return a - (float)b; } +inline AT_HOSTDEVICE float operator*(float a, Half b) { return a * (float)b; } +inline AT_HOSTDEVICE float operator/(float a, Half b) { return a / (float)b; } + +inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { return a += (float)b; } +inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { return a -= (float)b; } +inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { return a *= (float)b; } +inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { return a /= (float)b; } + +/// Arithmetic with doubles + +inline AT_HOSTDEVICE double operator+(Half a, double b) { 
return (double)a + b; } +inline AT_HOSTDEVICE double operator-(Half a, double b) { return (double)a - b; } +inline AT_HOSTDEVICE double operator*(Half a, double b) { return (double)a * b; } +inline AT_HOSTDEVICE double operator/(Half a, double b) { return (double)a / b; } + +inline AT_HOSTDEVICE double operator+(double a, Half b) { return a + (double)b; } +inline AT_HOSTDEVICE double operator-(double a, Half b) { return a - (double)b; } +inline AT_HOSTDEVICE double operator*(double a, Half b) { return a * (double)b; } +inline AT_HOSTDEVICE double operator/(double a, Half b) { return a / (double)b; } + +/// Arithmetic with ints + +inline AT_HOSTDEVICE Half operator+(Half a, int b) { return a + (Half)b; } +inline AT_HOSTDEVICE Half operator-(Half a, int b) { return a - (Half)b; } +inline AT_HOSTDEVICE Half operator*(Half a, int b) { return a * (Half)b; } +inline AT_HOSTDEVICE Half operator/(Half a, int b) { return a / (Half)b; } + +inline AT_HOSTDEVICE Half operator+(int a, Half b) { return (Half)a + b; } +inline AT_HOSTDEVICE Half operator-(int a, Half b) { return (Half)a - b; } +inline AT_HOSTDEVICE Half operator*(int a, Half b) { return (Half)a * b; } +inline AT_HOSTDEVICE Half operator/(int a, Half b) { return (Half)a / b; } + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from at::Half to float. + +} // namespace at + +namespace std { + +template<> class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = numeric_limits::tinyness_before; + static constexpr at::Half min() { return at::Half(0x0400, at::Half::from_bits); } + static constexpr at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits); } + static constexpr at::Half max() { return at::Half(0x7BFF, at::Half::from_bits); } + static constexpr at::Half epsilon() { return at::Half(0x1400, at::Half::from_bits); } + static constexpr at::Half round_error() { return at::Half(0x3800, at::Half::from_bits); } + static constexpr at::Half infinity() { return at::Half(0x7C00, at::Half::from_bits); } + static constexpr at::Half quiet_NaN() { return at::Half(0x7E00, at::Half::from_bits); } + static constexpr at::Half signaling_NaN() { return at::Half(0x7D00, at::Half::from_bits); } + static constexpr at::Half denorm_min() { return at::Half(0x0001, at::Half::from_bits); } +}; + +} // namespace std diff --git a/aten/src/ATen/Half.cpp b/aten/src/ATen/Half.cpp new file mode 100644 index 00000000000000..68f80a56ea8195 --- /dev/null +++ b/aten/src/ATen/Half.cpp @@ -0,0 
+1,34 @@ +#include "ATen/Half.h" + +#include "ATen/Tensor.h" +#include "ATen/Context.h" + +#include +#include + +namespace at { + +static_assert(std::is_standard_layout::value, "at::Half must be standard layout."); + +namespace detail { + +float halfbits2float(unsigned short bits) { + float value; + TH_halfbits2float(&bits, &value); + return value; +} + +unsigned short float2halfbits(float value) { + unsigned short bits; + TH_float2halfbits(&value, &bits); + return bits; +} + +} // namespace detail + +std::ostream& operator<<(std::ostream & out, const Half& value) { + out << (float)value; + return out; +} + +} // namespace at diff --git a/aten/src/ATen/Half.h b/aten/src/ATen/Half.h index 21941116f19e82..b7ac47e4fda79a 100644 --- a/aten/src/ATen/Half.h +++ b/aten/src/ATen/Half.h @@ -1,2 +1,120 @@ #pragma once -#include + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinisics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include "ATen/ATenGeneral.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifndef AT_HOSTDEVICE + #ifdef __CUDACC__ + #define AT_HOSTDEVICE __host__ __device__ + #else + #define AT_HOSTDEVICE + #endif +#endif + +namespace at { + +namespace detail { + +AT_API float halfbits2float(unsigned short bits); +AT_API unsigned short float2halfbits(float value); + +} + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + static constexpr from_bits_t from_bits = from_bits_t(); + + // HIP wants __host__ __device__ tag, CUDA does not +#ifdef __HIP_PLATFORM_HCC__ + AT_HOSTDEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits) {}; + inline AT_HOSTDEVICE Half(float value); + inline AT_HOSTDEVICE operator float() const; + +#ifdef __CUDACC__ + inline AT_HOSTDEVICE Half(const __half& value); + inline AT_HOSTDEVICE operator __half() const; +#endif +}; + +template To convert(From f) { + return static_cast(f); +} + +// skip isnan and isinf check for integral types +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + if (!limit::is_signed && std::numeric_limits::is_signed) { + // allow for negative numbers to wrap using two's complement arithmetic. + // For example, with uint8, this allows for `a - b` to be treated as + // `a + 255 * b`. 
+ return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); + } else { + return f < limit::lowest() || f > limit::max(); + } +} + +template +typename std::enable_if::value, bool>::type overflows(From f) { + using limit = std::numeric_limits; + if (limit::has_infinity && std::isinf((double)f)) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +template To checked_convert(From f, const char* name) { + if (overflows(f)) { + std::string msg = "value cannot be converted to type "; + msg += name; + msg += " without overflow: "; + msg += std::to_string(f); + throw std::domain_error(std::move(msg)); + } + return convert(f); +} + +template +To HalfFix(From h) { + To ret; + ret.x = h.x; + return ret; +} + +AT_API std::ostream& operator<<(std::ostream & out, const Half& value); + +} // namespace at + +#include "Half-inl.h" + +#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h index 6aadd62eb1d3fd..794d8e5f8c31a9 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -37,9 +37,7 @@ inline void parallel_for( f(begin_tid, std::min(end, chunk_size + begin_tid)); } #else - if (begin < end) { - f(begin, end); - } + f(begin, end); #endif } diff --git a/aten/src/ATen/Scalar.h b/aten/src/ATen/Scalar.h index f0b84d67554c02..e80d467b138ac3 100644 --- a/aten/src/ATen/Scalar.h +++ b/aten/src/ATen/Scalar.h @@ -10,6 +10,7 @@ #include "ATen/Half.h" #include "ATen/ScalarType.h" #include "ATen/TensorBase.h" +#include "ATen/Utils.h" namespace at { diff --git a/aten/src/ATen/ScalarType.h b/aten/src/ATen/ScalarType.h index 3651aef60e3e1e..f7c9243a89df2a 100644 --- a/aten/src/ATen/ScalarType.h +++ b/aten/src/ATen/ScalarType.h @@ -10,16 +10,16 @@ namespace at { // NB: Order matters for this macro; it is relied upon in -// _promoteTypesLookup and the serialization format. +// _promoteTypesLookup and probably other places. #define AT_FORALL_SCALAR_TYPES(_) \ -_(uint8_t,Byte,i) /* 0 */ \ -_(int8_t,Char,i) /* 1 */ \ -_(int16_t,Short,i) /* 2 */ \ -_(int,Int,i) /* 3 */ \ -_(int64_t,Long,i) /* 4 */ \ -_(at::Half,Half,d) /* 5 */ \ -_(float,Float,d) /* 6 */ \ -_(double,Double,d) /* 7 */ +_(uint8_t,Byte,i) \ +_(int8_t,Char,i) \ +_(int16_t,Short,i) \ +_(int,Int,i) \ +_(int64_t,Long,i) \ +_(at::Half,Half,d) \ +_(float,Float,d) \ +_(double,Double,d) #define AT_FORALL_SCALAR_TYPES_EXCEPT_HALF(_) \ _(uint8_t,Byte,i) \ @@ -35,7 +35,7 @@ enum class ScalarType { n, AT_FORALL_SCALAR_TYPES(DEFINE_ENUM) #undef DEFINE_ENUM - Undefined, // 8 + Undefined, NumOptions }; diff --git a/aten/src/ATen/core/SmallVector.cpp b/aten/src/ATen/SmallVector.cpp similarity index 87% rename from aten/src/ATen/core/SmallVector.cpp rename to aten/src/ATen/SmallVector.cpp index 976809c5b50931..59095a2809c7a8 100644 --- a/aten/src/ATen/core/SmallVector.cpp +++ b/aten/src/ATen/SmallVector.cpp @@ -14,22 +14,20 @@ // ATen: modified from llvm::SmallVector. // replaced report_bad_alloc_error with std::bad_alloc -#include +#include "SmallVector.h" namespace at { /// grow_pod - This is an implementation of the grow() method which only works /// on POD-like datatypes and is out of line to reduce code duplication. -void SmallVectorBase::grow_pod( - void* FirstEl, - size_t MinSizeInBytes, - size_t TSize) { +void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes, + size_t TSize) { size_t CurSizeBytes = size_in_bytes(); size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. 
if (NewCapacityInBytes < MinSizeInBytes) NewCapacityInBytes = MinSizeInBytes; - void* NewElts; + void *NewElts; if (BeginX == FirstEl) { NewElts = malloc(NewCapacityInBytes); if (NewElts == nullptr) @@ -44,9 +42,9 @@ void SmallVectorBase::grow_pod( throw std::bad_alloc(); } - this->EndX = (char*)NewElts + CurSizeBytes; + this->EndX = (char*)NewElts+CurSizeBytes; this->BeginX = NewElts; this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; } -} // namespace at +} diff --git a/aten/src/ATen/SmallVector.h b/aten/src/ATen/SmallVector.h index 1dbaa933c555dd..7c52ef686aa41a 100644 --- a/aten/src/ATen/SmallVector.h +++ b/aten/src/ATen/SmallVector.h @@ -1,2 +1,982 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. +// replaced report_bad_alloc_error with std::bad_alloc +// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE +// replaced iterator_range constructor with inline Container&& constructor +// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers +// removed LLVM_UNLIKELY + #pragma once -#include + +#include "AlignOf.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __GNUG__ && __GNUC__ < 5 +#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) +#else +#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value +#endif + +namespace at { + +namespace detail { + +// From llvm/Support/MathExtras.h +static inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +} + +/// This is all the non-templated stuff common to all SmallVectors. +class AT_API SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize); + +public: + /// This returns size()*sizeof(T). + size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon : public SmallVectorBase { +private: + template friend struct SmallVectorStorage; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. 
+ using U = AlignedCharArrayUnion; + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + +protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + +public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T *; + using const_iterator = const T *; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T &; + using const_reference = const T &; + using pointer = T *; + using const_pointer = const T *; + + // forward iterator creation methods. + iterator begin() { return (iterator)this->BeginX; } + const_iterator begin() const { return (const_iterator)this->BeginX; } + iterator end() { return (iterator)this->EndX; } + const_iterator end() const { return (const_iterator)this->EndX; } + +protected: + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + +public: + // reverse iterator creation methods. + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + size_type size() const { return end()-begin(); } + size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { return const_pointer(begin()); } + + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with non-POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. 
+ template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast(malloc(NewCapacity*sizeof(T))); + if (NewElts == nullptr) + throw std::bad_alloc(); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + + +/// SmallVectorTemplateBase - This is where we put method +/// implementations that are designed to work with POD-like T's. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. 
+ if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } + +public: + void push_back(const T &Elt) { + if (this->EndX >= this->CapacityX) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. +template +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + +public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using size_type = typename SuperClass::size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase(N*sizeof(T)) { + } + +public: + SmallVectorImpl(const SmallVectorImpl &) = delete; + + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + auto I = this->end(); + for (auto E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. + void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) 
variants. + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + void assign(in_iter in_start, in_iter in_end) { + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + assert(I >= this->begin() && "Iterator to erase is out of bounds."); + assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(S >= this->begin() && "Range to erase is out of bounds."); + assert(S <= E && "Trying to erase invalid range."); + assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. 
+ append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + assert(I >= this->begin() && "Insertion iterator is out of bounds."); + assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template void emplace_back(ArgTypes &&... 
Args) { + if (this->EndX >= this->CapacityX) + this->grow(); + ::new ((void *)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template +SmallVectorImpl &SmallVectorImpl:: + operator=(const SmallVectorImpl &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. 
+ this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template +SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements which aren't contained in +/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' +/// element is in the base class. This is specialized for the N=1 and N=0 cases +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; +template struct SmallVectorStorage {}; +template struct SmallVectorStorage {}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// Note that this does not attempt to be exception safe. +/// +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. 
+ SmallVectorStorage Storage; + +public: + SmallVector() : SmallVectorImpl(N) {} + + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template ::iterator_category, + std::input_iterator_tag>::value>::type> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + template + explicit SmallVector(Container &&c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + template + const SmallVector &operator=(const Container &RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + template + const SmallVector &operator=(Container &&C) { + this->assign(C.begin(), C.end()); + return *this; + } + + const SmallVector &operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector &X) { + return X.capacity_in_bytes(); +} + +} // end namespace at + +namespace std { + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVectorImpl &LHS, at::SmallVectorImpl &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(at::SmallVector &LHS, at::SmallVector &RHS) { + LHS.swap(RHS); + } + +} // end namespace std diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 03a5a6008e7d24..968fd8ebbec266 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -18,14 +18,14 @@ namespace at { // tensor and a [0] size values tensor for such an empty tensor. However, // we don't currently support zero-size dimensions, so we can't actually // do this; so we just allocate zero-size tensors for everything. 
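// Illustrative sketch of the COO bookkeeping the comment above describes
// (hypothetical standalone struct, not the ATen API): an "empty" sparse
// tensor keeps one sparse dim, zero dense dims, nnz == 0, and zero-size
// index/value buffers standing in for the ideal [1, 0] indices / [0] values.
#include <cstdint>
#include <vector>

struct SparseCooShape {
  int64_t sparseDims = 1;        // dimensions indexed sparsely
  int64_t denseDims = 0;         // trailing dense dimensions
  int64_t nnz = 0;               // number of stored entries
  std::vector<int64_t> indices;  // ideally sparseDims * nnz longs
  std::vector<float> values;     // ideally nnz * (dense numel) values
};

// An empty sparse tensor: every buffer is zero-sized.
static const SparseCooShape kEmptySparse{};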
-SparseTensorImpl::SparseTensorImpl(at::Backend backend, at::ScalarType scalar_type) - : TensorImpl(backend, scalar_type, nullptr, false) +SparseTensorImpl::SparseTensorImpl(Type * type) + : TensorImpl(type, nullptr) , size_{0} , sparseDims_(1) , denseDims_(0) - , indices_(globalContext().getTypeOpt(toDense(backend), ScalarType::Long)->tensor()) - , values_(globalContext().getTypeOpt(toDense(backend), scalar_type)->tensor()) { - AT_ASSERT(backend == Backend::SparseCPU || backend == Backend::SparseCUDA); + , indices_(type->toDense().toScalarType(ScalarType::Long).tensor()) + , values_(type->toDense().tensor()) { + AT_ASSERT(type->is_sparse()); } IntList SparseTensorImpl::sizes() const { diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index 307c0f9e5574d1..9ef08705bb0f45 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -48,7 +48,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { public: // Public for now... - explicit SparseTensorImpl(at::Backend, at::ScalarType); + explicit SparseTensorImpl(Type * type); int64_t nnz() const { return nnz_; } int64_t sparseDims() const { return sparseDims_; } @@ -75,7 +75,7 @@ struct AT_API SparseTensorImpl : public TensorImpl { if (size.size() == 0) { size_ = {0}; } else { - size_ = size.vec(); + size_ = size; } sparseDims_ = sparseDims; denseDims_ = denseDims; diff --git a/aten/src/ATen/Storage.cpp b/aten/src/ATen/Storage.cpp index 991cfba92efd2a..f5ba512cc27105 100644 --- a/aten/src/ATen/Storage.cpp +++ b/aten/src/ATen/Storage.cpp @@ -1,32 +1,23 @@ #include +#include #include namespace at { -Storage::Storage(at::ScalarType scalar_type, size_t size, Allocator* allocator) - : storage_impl_(new StorageImpl( - scalar_type, - size, - allocator, - /* resizable */ false)) {} - -Storage::Storage( - at::ScalarType scalar_type, - at::DataPtr data_ptr, - size_t size, - const std::function& deleter) - : storage_impl_(new StorageImpl( - scalar_type, - size, - std::move(data_ptr), - /* allocator */ nullptr, - /* resizable */ false)) {} - Storage::~Storage() { if (!storage_impl_) { return; } - storage_impl_->release(); + if (--storage_impl_->refcount == 0) { + if (storage_impl_->finalizer) { + (*storage_impl_->finalizer)(); + } + storage_impl_->finalizer = nullptr; + storage_impl_->data_ptr.clear(); + if (storage_impl_ && --storage_impl_->weakcount == 0) { + delete storage_impl_; + } + } } } // namespace at diff --git a/aten/src/ATen/Storage.h b/aten/src/ATen/Storage.h index aa27296c74d40f..a5c85192e36f8c 100644 --- a/aten/src/ATen/Storage.h +++ b/aten/src/ATen/Storage.h @@ -8,12 +8,6 @@ struct AT_API Storage { public: Storage() = delete; Storage(StorageImpl* storage_impl) : storage_impl_(storage_impl) {} - Storage(at::ScalarType, size_t size, Allocator* allocator); - Storage( - at::ScalarType, - at::DataPtr, - size_t size, - const std::function& deleter); ~Storage(); // There are reasonable interpretations of these constructors, but they're to // be implemented on demand. 
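// Minimal sketch of the strong/weak counting scheme used in the Storage
// destructor above (generic standalone example, not the ATen types): the
// payload is torn down when the last strong reference dies, while the
// control object itself stays alive until the last weak reference is gone.
// The initial strong reference also counts as one weak reference, mirroring
// `weakcount(1) // from the strong reference` below.
#include <atomic>

struct RefCounted {
  std::atomic<int> refcount{1};   // strong references
  std::atomic<int> weakcount{1};  // weak references + 1 held by the strong side
  void (*finalizer)() = nullptr;  // optional user teardown hook

  // Assumes the object was heap-allocated; called when a strong owner dies.
  void releaseStrong() {
    if (--refcount == 0) {
      if (finalizer) finalizer();  // run teardown exactly once
      finalizer = nullptr;
      // ...release the payload here (data_ptr.clear() in Storage)...
      if (--weakcount == 0) {
        delete this;               // no weak observers left either
      }
    }
  }
};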
diff --git a/aten/src/ATen/StorageImpl.cpp b/aten/src/ATen/StorageImpl.cpp index 6e3d693d012c5c..a26f8971310aa5 100644 --- a/aten/src/ATen/StorageImpl.cpp +++ b/aten/src/ATen/StorageImpl.cpp @@ -12,6 +12,8 @@ StorageImpl::StorageImpl( : scalar_type(scalar_type), data_ptr(std::move(data_ptr)), size(size), + refcount(1), + weakcount(1), // from the strong reference resizable(resizable), allocator(allocator), finalizer(nullptr) {} diff --git a/aten/src/ATen/StorageImpl.h b/aten/src/ATen/StorageImpl.h index f1c23c54677dba..c48ec51e013d4c 100644 --- a/aten/src/ATen/StorageImpl.h +++ b/aten/src/ATen/StorageImpl.h @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -40,7 +39,7 @@ namespace at { struct Type; -struct AT_API StorageImpl : public Retainable { +struct TH_CPP_API StorageImpl { StorageImpl() = delete; virtual ~StorageImpl() {}; @@ -49,6 +48,8 @@ struct AT_API StorageImpl : public Retainable { at::ScalarType scalar_type; at::DataPtr data_ptr; ptrdiff_t size; + std::atomic refcount; + std::atomic weakcount; bool resizable; at::Allocator* allocator; std::unique_ptr finalizer; @@ -57,8 +58,6 @@ struct AT_API StorageImpl : public Retainable { StorageImpl(StorageImpl&&) = delete; StorageImpl(const StorageImpl&&) = delete; - // TODO: Rename this into th_data, and move it out of the class; - // the real data shouldn't call th::from_type template inline T* data() const { auto scalar_type_T = at::CTypeToScalarType>::to(); @@ -77,14 +76,6 @@ struct AT_API StorageImpl : public Retainable { return static_cast(this->data_ptr.get()); } - void release_resources() { - if (finalizer) { - (*finalizer)(); - } - finalizer = nullptr; - data_ptr.clear(); - } - void operator=(const StorageImpl&) = delete; virtual size_t elementSize() const { @@ -103,6 +94,9 @@ struct AT_API StorageImpl : public Retainable { const void* data() const { return data_ptr.get(); }; + void retain() { + ++refcount; + } int getDevice() const { return data_ptr.device().index(); diff --git a/aten/src/ATen/THLongStorageView.h b/aten/src/ATen/THLongStorageView.h index 8ebcfdaeada40f..55e7d3de6dea4a 100644 --- a/aten/src/ATen/THLongStorageView.h +++ b/aten/src/ATen/THLongStorageView.h @@ -64,6 +64,7 @@ class THLongStorageView { storage.size = ref.size(); } storage.scalar_type = at::CTypeToScalarType>::to(); + storage.refcount = 0; storage.set_resizable(false); } private: diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index 15f59e902182c4..60f6098762cd05 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -9,7 +9,7 @@ struct AT_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} explicit TensorGeometry(IntList sizes) - : sizes_(sizes.vec()) + : sizes_(sizes) , strides_(sizes.size()) , storage_offset_(0) { int64_t dim = sizes.size(); @@ -21,8 +21,8 @@ struct AT_API TensorGeometry { } explicit TensorGeometry(const Tensor& t) - : sizes_(t.sizes().vec()) - , strides_(t.strides().vec()) + : sizes_(t.sizes()) + , strides_(t.strides()) , storage_offset_(t.storage_offset()) {} // true if the tensor is contiguous diff --git a/aten/src/ATen/TensorImpl.cpp b/aten/src/ATen/TensorImpl.cpp index a48cb033b2de49..59cc303a1acf5c 100644 --- a/aten/src/ATen/TensorImpl.cpp +++ b/aten/src/ATen/TensorImpl.cpp @@ -2,23 +2,10 @@ #include #include -#include - -#include #include namespace at { - -Type& TensorImpl::type() const { - Type* base_type = &globalContext().getType(backend_, scalar_type_); - if (is_variable_) { - return 
detail::getVariableHooks().getVariableType(*base_type); - } else { - return *base_type; - } -} - Tensor& TensorImpl::grad() { AT_ERROR("grad is not implemented for Tensor"); } diff --git a/aten/src/ATen/TensorImpl.h b/aten/src/ATen/TensorImpl.h index 1aa4d8390ed175..9c3591eb96b31f 100644 --- a/aten/src/ATen/TensorImpl.h +++ b/aten/src/ATen/TensorImpl.h @@ -18,18 +18,16 @@ struct Tensor; namespace at { struct AT_API TensorImpl : public Retainable { - explicit TensorImpl(Backend backend, ScalarType scalar_type, THTensor * tensor, bool is_variable) - : backend_(backend), scalar_type_(scalar_type), is_variable_(is_variable), tensor(tensor) {} + explicit TensorImpl(Type * type, THTensor * tensor) + : type_(type), tensor(tensor) {} virtual ~TensorImpl(); virtual void release_resources() override; - // The implementation of this method will have to be hoisted out and - // hooked in, so that Caffe2 doesn't need to know about Context - // TODO: This really really needs to be inlined. - Type & type() const; - + Type & type() const { + return *type_; + } const char * toString() const; virtual IntList sizes() const; virtual IntList strides() const; @@ -93,12 +91,8 @@ struct AT_API TensorImpl : public Retainable { virtual void set_data(Tensor new_data); protected: - Backend backend_; - // INVARIANT: When storage is non-null, this scalar type must - // agree with the scalar type in storage - ScalarType scalar_type_; - bool is_variable_ = false; bool is_wrapped_number_ = false; + Type * type_; public: THTensor * tensor; }; diff --git a/aten/src/ATen/UndefinedTensor.cpp b/aten/src/ATen/UndefinedTensor.cpp index ecfb70fa1bbede..5e4059421c1283 100644 --- a/aten/src/ATen/UndefinedTensor.cpp +++ b/aten/src/ATen/UndefinedTensor.cpp @@ -6,7 +6,7 @@ namespace at { // should this use the globalContext? Can it get a context passed in somehow? UndefinedTensor::UndefinedTensor() -: TensorImpl(Backend::Undefined, ScalarType::Undefined, nullptr, /* is variable */ false) { +: TensorImpl(&(globalContext().getType(Backend::Undefined,ScalarType::Undefined)), nullptr) { } IntList UndefinedTensor::sizes() const { diff --git a/aten/src/ATen/core/ATenCoreTest.cpp b/aten/src/ATen/core/ATenCoreTest.cpp deleted file mode 100644 index 5bb595a0bce5de..00000000000000 --- a/aten/src/ATen/core/ATenCoreTest.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include - -namespace at { - -static int CoreTestGlobal = 0; -int CoreTest() { - return CoreTestGlobal++; -} - -} // namespace at diff --git a/aten/src/ATen/core/ATenCoreTest.h b/aten/src/ATen/core/ATenCoreTest.h deleted file mode 100644 index ee8471f66fe258..00000000000000 --- a/aten/src/ATen/core/ATenCoreTest.h +++ /dev/null @@ -1,8 +0,0 @@ -#pragma once - -#include - -namespace at { - -AT_CORE_API int CoreTest(); -} diff --git a/aten/src/ATen/core/ArrayRef.h b/aten/src/ATen/core/ArrayRef.h deleted file mode 100644 index 7e997d6572f3c0..00000000000000 --- a/aten/src/ATen/core/ArrayRef.h +++ /dev/null @@ -1,212 +0,0 @@ -//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::ArrayRef. -// removed llvm-specific functionality -// removed some implicit const -> non-const conversions that rely on -// complicated std::enable_if meta-programming -// removed a bunch of slice variants for simplicity... 
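// Usage sketch for the ArrayRef view type defined below (function names here
// are hypothetical; the include path is illustrative). Because ArrayRef is a
// non-owning (pointer, length) pair with implicit constructors, one signature
// accepts vectors, std::arrays, C arrays and brace lists alike, as long as
// the underlying buffer outlives the call.
#include <array>
#include <cstdint>
#include <vector>
#include <ATen/ArrayRef.h>  // wherever at::ArrayRef lives in this tree

static int64_t sumAll(at::ArrayRef<int64_t> xs) {
  int64_t total = 0;
  for (int64_t x : xs) total += x;
  return total;
}

static void arrayRefDemo() {
  std::vector<int64_t> v = {1, 2, 3};
  std::array<int64_t, 2> a = {{4, 5}};
  int64_t c[] = {6, 7};
  sumAll(v);           // from std::vector
  sumAll(a);           // from std::array
  sumAll(c);           // from C array
  sumAll({8, 9, 10});  // from initializer_list (temporary: use immediately)
}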
- -#pragma once - -#include -#include -#include - -#include -#include -#include - -namespace at { - -/// ArrayRef - Represent a constant reference to an array (0 or more elements -/// consecutively in memory), i.e. a start pointer and a length. It allows -/// various APIs to take consecutive elements easily and conveniently. -/// -/// This class does not own the underlying data, it is expected to be used in -/// situations where the data resides in some other buffer, whose lifetime -/// extends past that of the ArrayRef. For this reason, it is not in general -/// safe to store an ArrayRef. -/// -/// This is intended to be trivially copyable, so it should be passed by -/// value. -template -class ArrayRef final { - public: - using iterator = const T*; - using const_iterator = const T*; - using size_type = size_t; - - using reverse_iterator = std::reverse_iterator; - - private: - /// The start of the array, in an external buffer. - const T* Data; - - /// The number of elements. - size_type Length; - - public: - /// @name Constructors - /// @{ - - /// Construct an empty ArrayRef. - /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} - - /// Construct an ArrayRef from a single element. - // TODO Make this explicit - constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} - - /// Construct an ArrayRef from a pointer and length. - constexpr ArrayRef(const T* data, size_t length) - : Data(data), Length(length) {} - - /// Construct an ArrayRef from a range. - constexpr ArrayRef(const T* begin, const T* end) - : Data(begin), Length(end - begin) {} - - /// Construct an ArrayRef from a SmallVector. This is templated in order to - /// avoid instantiating SmallVectorTemplateCommon whenever we - /// copy-construct an ArrayRef. - template - /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from a std::vector. - template - /* implicit */ ArrayRef(const std::vector& Vec) - : Data(Vec.data()), Length(Vec.size()) {} - - /// Construct an ArrayRef from a std::array - template - /* implicit */ constexpr ArrayRef(const std::array& Arr) - : Data(Arr.data()), Length(N) {} - - /// Construct an ArrayRef from a C array. - template - /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} - - /// Construct an ArrayRef from a std::initializer_list. - /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) - : Data(Vec.begin() == Vec.end() ? static_cast(nullptr) : Vec.begin()), - Length(Vec.size()) {} - - /// @} - /// @name Simple Operations - /// @{ - - constexpr iterator begin() const { - return Data; - } - constexpr iterator end() const { - return Data + Length; - } - - constexpr reverse_iterator rbegin() const { - return reverse_iterator(end()); - } - constexpr reverse_iterator rend() const { - return reverse_iterator(begin()); - } - - /// empty - Check if the array is empty. - constexpr bool empty() const { - return Length == 0; - } - - constexpr const T* data() const { - return Data; - } - - /// size - Get the array size. - constexpr size_t size() const { - return Length; - } - - /// front - Get the first element. - AT_CPP14_CONSTEXPR const T& front() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access front() of empty list"); - return Data[0]; - } - - /// back - Get the last element. 
- AT_CPP14_CONSTEXPR const T& back() const { - AT_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); - return Data[Length - 1]; - } - - /// equals - Check for element-wise equality. - constexpr bool equals(ArrayRef RHS) const { - return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); - } - - /// slice(n, m) - Chop off the first N elements of the array, and keep M - /// elements in the array. - AT_CPP14_CONSTEXPR ArrayRef slice(size_t N, size_t M) const { - AT_CHECK( - N + M <= size(), - "ArrayRef: invalid slice, N = ", - N, - "; M = ", - M, - "; size = ", - size()); - return ArrayRef(data() + N, M); - } - - /// slice(n) - Chop off the first N elements of the array. - constexpr ArrayRef slice(size_t N) const { - return slice(N, size() - N); - } - - /// @} - /// @name Operator Overloads - /// @{ - constexpr const T& operator[](size_t Index) const { - return Data[Index]; - } - - /// Vector compatibility - AT_CPP14_CONSTEXPR const T& at(size_t Index) const { - AT_CHECK( - Index < Length, - "ArrayRef: invalid index Index = ", - Index, - "; Length = ", - Length); - return Data[Index]; - } - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - typename std::enable_if::value, ArrayRef>::type& - operator=(U&& Temporary) = delete; - - /// Disallow accidental assignment from a temporary. - /// - /// The declaration here is extra complicated so that "arrayRef = {}" - /// continues to select the move assignment operator. - template - typename std::enable_if::value, ArrayRef>::type& - operator=(std::initializer_list) = delete; - - /// @} - /// @name Expensive Operations - /// @{ - std::vector vec() const { - return std::vector(Data, Data + Length); - } - - /// @} -}; - -} // namespace at diff --git a/aten/src/ATen/core/Backtrace.h b/aten/src/ATen/core/Backtrace.h deleted file mode 100644 index ec4c17c6f6a531..00000000000000 --- a/aten/src/ATen/core/Backtrace.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace at { -/// Utility to demangle a C++ symbol name. -AT_CORE_API std::string demangle(const char* name); - -/// Returns the printable name of the type. 
-template -inline const char* demangle_type() { -#ifdef __GXX_RTTI - static const std::string name = demangle(typeid(T).name()); - return name.c_str(); -#else // __GXX_RTTI - return "(RTTI disabled, cannot show name)"; -#endif // __GXX_RTTI -} - -AT_CORE_API std::string get_backtrace( - size_t frames_to_skip = 0, - size_t maximum_number_of_frames = 64, - bool skip_python_frames = true); -} // namespace at diff --git a/aten/src/ATen/core/C++17.cpp b/aten/src/ATen/core/C++17.cpp deleted file mode 100644 index 6074cb6be15e9c..00000000000000 --- a/aten/src/ATen/core/C++17.cpp +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/aten/src/ATen/core/CMakeLists.txt b/aten/src/ATen/core/CMakeLists.txt deleted file mode 100644 index 59149be784c3a6..00000000000000 --- a/aten/src/ATen/core/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -# This file solely exists to let Caffe2 Android build get at the list -# of core files without having to trundle through all of ATen's CMakeLists.txt - -FILE(GLOB ATen_CORE_HEADERS "*.h") -FILE(GLOB ATen_CORE_SRCS "*.cpp") -FILE(GLOB ATen_CORE_TEST_SRCS "*_test.cpp") -EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS}) - -# Pass to parent -set(ATen_CORE_HEADERS ${ATen_CORE_HEADERS} PARENT_SCOPE) -set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) -set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) -# This is a little dodgy, because it means ALL ATen headers are made -# visible. Fortunately, you should just get a lot of undefined symbol -# errors if you go outside core -set(ATen_CORE_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../.. PARENT_SCOPE) diff --git a/aten/src/ATen/core/CoreAPI.h b/aten/src/ATen/core/CoreAPI.h deleted file mode 100644 index 0ee114d9f4cfdd..00000000000000 --- a/aten/src/ATen/core/CoreAPI.h +++ /dev/null @@ -1,20 +0,0 @@ -// You can use the definition AT_CORE_STATIC_WINDOWS to control whether -// or not we apply __declspec. You will want to set this as -// -DAT_CORE_STATIC_WINDOWS=1 when compiling code which links -// against ATen/core on Windows, when ATen/core is built as a -// static library (in which case, saying the symbol is coming -// from a DLL would be incorrect). - -#ifdef _WIN32 -#if !defined(AT_CORE_STATIC_WINDOWS) -#if defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS) -#define AT_CORE_API __declspec(dllexport) -#else -#define AT_CORE_API __declspec(dllimport) -#endif -#else -#define AT_CORE_API -#endif -#else -#define AT_CORE_API -#endif diff --git a/aten/src/ATen/core/Error.h b/aten/src/ATen/core/Error.h deleted file mode 100644 index b95a5f120f21b8..00000000000000 --- a/aten/src/ATen/core/Error.h +++ /dev/null @@ -1,147 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -namespace at { - -namespace detail { - -inline std::ostream& _str(std::ostream& ss) { - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t) { - ss << t; - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { - return _str(_str(ss, t), args...); -} - -} // namespace detail - -// Convert a list of string-like arguments into a single string. -template -inline std::string str(const Args&... args) { - std::ostringstream ss; - detail::_str(ss, args...); - return ss.str(); -} - -// Specializations for already-a-string types. 
-template <> -inline std::string str(const std::string& str) { - return str; -} -inline std::string str(const char* c_str) { - return c_str; -} - -/// Represents a location in source code (for debugging). -struct SourceLocation { - const char* function; - const char* file; - uint32_t line; -}; - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); - -/// The primary ATen error class. -/// Provides a complete error message with source location information via -/// `what()`, and a more concise message via `what_without_backtrace()`. Should -/// primarily be used with the `AT_ERROR` macro. -/// -/// NB: at::Error is handled specially by the default torch to suppress the -/// backtrace, see torch/csrc/Exceptions.h -class AT_CORE_API Error : public std::exception { - std::string what_without_backtrace_; - std::string what_; - - public: - Error(SourceLocation source_location, std::string err); - - /// Returns the complete error message, including the source location. - const char* what() const noexcept override { - return what_.c_str(); - } - - /// Returns only the error message string, without source location. - const char* what_without_backtrace() const noexcept { - return what_without_backtrace_.c_str(); - } -}; - -class AT_CORE_API Warning { - using handler_t = - void (*)(const SourceLocation& source_location, const char* msg); - - public: - /// Issue a warning with a given message. Dispatched to the current - /// warning handler. - static void warn(SourceLocation source_location, std::string msg); - - /// Sets the global warning handler. This is not thread-safe, so it should - /// generally be called once during initialization. - static void set_warning_handler(handler_t handler); - - /// The default warning handler. Prints the message to stderr. - static void print_warning( - const SourceLocation& source_location, - const char* msg); - - private: - static handler_t warning_handler_; -}; - -} // namespace at - -// TODO: variants that print the expression tested and thus don't require -// strings -// TODO: CAFFE_ENFORCE_WITH_CALLER style macro - -#define AT_ERROR(...) \ - throw at::Error({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_WARN(...) \ - at::Warning::warn({__func__, __FILE__, __LINE__}, at::str(__VA_ARGS__)) - -#define AT_ASSERT(cond) \ - if (!(cond)) { \ - AT_ERROR( \ - #cond " ASSERT FAILED at ", \ - __FILE__, \ - ":", \ - __LINE__, \ - ", please report a bug to PyTorch."); \ - } - -#define AT_ASSERTM(cond, ...) \ - if (!(cond)) { \ - AT_ERROR(at::str( \ - #cond, \ - " ASSERT FAILED at ", \ - __FILE__, \ - ":", \ - __LINE__, \ - ", please report a bug to PyTorch. ", \ - __VA_ARGS__)); \ - } - -#define AT_CHECK(cond, ...) 
\ - if (!(cond)) { \ - AT_ERROR(at::str(__VA_ARGS__)); \ - } diff --git a/aten/src/ATen/core/Half-inl.h b/aten/src/ATen/core/Half-inl.h deleted file mode 100644 index d89b496d7083b8..00000000000000 --- a/aten/src/ATen/core/Half-inl.h +++ /dev/null @@ -1,249 +0,0 @@ -#pragma once - -#include -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -#if defined(__HIP_DEVICE_COMPILE__) -#include -#endif - -namespace at { - -/// Constructors - -inline AT_HOSTDEVICE Half::Half(float value) { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - x = __half_as_short(__float2half(value)); -#else - x = detail::float2halfbits(value); -#endif -} - -/// Implicit conversions - -inline AT_HOSTDEVICE Half::operator float() const { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(*reinterpret_cast(&x)); -#else - return detail::halfbits2float(x); -#endif -} - -#ifdef __CUDACC__ -inline AT_HOSTDEVICE Half::Half(const __half& value) { - x = *reinterpret_cast(&value); -} -inline AT_HOSTDEVICE Half::operator __half() const { - return *reinterpret_cast(&x); -} -#endif - -/// Arithmetic - -inline AT_HOSTDEVICE Half operator+(const Half& a, const Half& b) { - return (float)a + (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a, const Half& b) { - return (float)a - (float)b; -} - -inline AT_HOSTDEVICE Half operator*(const Half& a, const Half& b) { - return (float)a * (float)b; -} - -inline AT_HOSTDEVICE Half operator/(const Half& a, const Half& b) { - return (float)a / (float)b; -} - -inline AT_HOSTDEVICE Half operator-(const Half& a) { - return -(float)a; -} - -inline AT_HOSTDEVICE Half& operator+=(Half& a, const Half& b) { - a = a + b; - return a; -} - -inline AT_HOSTDEVICE Half& operator-=(Half& a, const Half& b) { - a = a - b; - return a; -} - -inline AT_HOSTDEVICE Half& operator*=(Half& a, const Half& b) { - a = a * b; - return a; -} - -inline AT_HOSTDEVICE Half& operator/=(Half& a, const Half& b) { - a = a / b; - return a; -} - -/// Arithmetic with floats - -inline AT_HOSTDEVICE float operator+(Half a, float b) { - return (float)a + b; -} -inline AT_HOSTDEVICE float operator-(Half a, float b) { - return (float)a - b; -} -inline AT_HOSTDEVICE float operator*(Half a, float b) { - return (float)a * b; -} -inline AT_HOSTDEVICE float operator/(Half a, float b) { - return (float)a / b; -} - -inline AT_HOSTDEVICE float operator+(float a, Half b) { - return a + (float)b; -} -inline AT_HOSTDEVICE float operator-(float a, Half b) { - return a - (float)b; -} -inline AT_HOSTDEVICE float operator*(float a, Half b) { - return a * (float)b; -} -inline AT_HOSTDEVICE float operator/(float a, Half b) { - return a / (float)b; -} - -inline AT_HOSTDEVICE float& operator+=(float& a, const Half& b) { - return a += (float)b; -} -inline AT_HOSTDEVICE float& operator-=(float& a, const Half& b) { - return a -= (float)b; -} -inline AT_HOSTDEVICE float& operator*=(float& a, const Half& b) { - return a *= (float)b; -} -inline AT_HOSTDEVICE float& operator/=(float& a, const Half& b) { - return a /= (float)b; -} - -/// Arithmetic with doubles - -inline AT_HOSTDEVICE double operator+(Half a, double b) { - return (double)a + b; -} -inline AT_HOSTDEVICE double operator-(Half a, double b) { - return (double)a - b; -} -inline AT_HOSTDEVICE double operator*(Half a, double b) { - return (double)a * b; -} -inline AT_HOSTDEVICE double operator/(Half a, double b) { - return (double)a / b; -} - -inline AT_HOSTDEVICE double operator+(double a, Half b) { - return a + (double)b; -} 
-inline AT_HOSTDEVICE double operator-(double a, Half b) { - return a - (double)b; -} -inline AT_HOSTDEVICE double operator*(double a, Half b) { - return a * (double)b; -} -inline AT_HOSTDEVICE double operator/(double a, Half b) { - return a / (double)b; -} - -/// Arithmetic with ints - -inline AT_HOSTDEVICE Half operator+(Half a, int b) { - return a + (Half)b; -} -inline AT_HOSTDEVICE Half operator-(Half a, int b) { - return a - (Half)b; -} -inline AT_HOSTDEVICE Half operator*(Half a, int b) { - return a * (Half)b; -} -inline AT_HOSTDEVICE Half operator/(Half a, int b) { - return a / (Half)b; -} - -inline AT_HOSTDEVICE Half operator+(int a, Half b) { - return (Half)a + b; -} -inline AT_HOSTDEVICE Half operator-(int a, Half b) { - return (Half)a - b; -} -inline AT_HOSTDEVICE Half operator*(int a, Half b) { - return (Half)a * b; -} -inline AT_HOSTDEVICE Half operator/(int a, Half b) { - return (Half)a / b; -} - -/// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion from at::Half to float. - -} // namespace at - -namespace std { - -template <> -class numeric_limits { - public: - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = - numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - static constexpr int radix = 2; - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = - numeric_limits::tinyness_before; - static constexpr at::Half min() { - return at::Half(0x0400, at::Half::from_bits); - } - static constexpr at::Half lowest() { - return at::Half(0xFBFF, at::Half::from_bits); - } - static constexpr at::Half max() { - return at::Half(0x7BFF, at::Half::from_bits); - } - static constexpr at::Half epsilon() { - return at::Half(0x1400, at::Half::from_bits); - } - static constexpr at::Half round_error() { - return at::Half(0x3800, at::Half::from_bits); - } - static constexpr at::Half infinity() { - return at::Half(0x7C00, at::Half::from_bits); - } - static constexpr at::Half quiet_NaN() { - return at::Half(0x7E00, at::Half::from_bits); - } - static constexpr at::Half signaling_NaN() { - return at::Half(0x7D00, at::Half::from_bits); - } - static constexpr at::Half denorm_min() { - return at::Half(0x0001, at::Half::from_bits); - } -}; - -} // namespace std diff --git a/aten/src/ATen/core/Half.cpp b/aten/src/ATen/core/Half.cpp deleted file mode 100644 index e511f03a92bc73..00000000000000 --- a/aten/src/ATen/core/Half.cpp +++ /dev/null @@ -1,105 +0,0 @@ -#include - -#include - -namespace at { - -static_assert( - std::is_standard_layout::value, - "at::Half must be standard layout."); - -namespace detail { - -// Host functions for converting between FP32 and FP16 formats - -float halfbits2float(unsigned 
short h) { - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71; - do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70; - } - - unsigned result_bit = (sign << 31) | (exponent << 23) | mantissa; - - // Reinterpret the result bit pattern as a float - float result_float; - std::memcpy(&result_float, &result_bit, sizeof(result_float)); - return result_float; -} - -unsigned short float2halfbits(float src) { - // Reinterpret the float as a bit pattern - unsigned x; - std::memcpy(&x, &src, sizeof(x)); - - unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; - - // Get rid of +NaN/-NaN case first. - if (u > 0x7f800000) { - return 0x7fffU; - } - - sign = ((x >> 16) & 0x8000); - - // Get rid of +Inf/-Inf, +0/-0. - if (u > 0x477fefff) { - return sign | 0x7c00U; - } - if (u < 0x33000001) { - return (sign | 0x0000); - } - - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); - - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; - } else { - shift = 0x7e - exponent; - exponent = 0; - mantissa |= 0x800000; - } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); - - // Round to nearest even. - remainder = (mantissa & lsb_m1); - mantissa >>= shift; - if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { - ++mantissa; - if (!(mantissa & 0x3ff)) { - ++exponent; - mantissa = 0; - } - } - - return (sign | (exponent << 10) | mantissa); -} - -} // namespace detail - -std::ostream& operator<<(std::ostream& out, const Half& value) { - out << (float)value; - return out; -} - -} // namespace at diff --git a/aten/src/ATen/core/Half.h b/aten/src/ATen/core/Half.h deleted file mode 100644 index 385f18e78cab02..00000000000000 --- a/aten/src/ATen/core/Half.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -/// Defines the Half type (half-precision floating-point) including conversions -/// to standard C types and basic arithmetic operations. Note that arithmetic -/// operations are implemented by converting to floating point and -/// performing the operation in float32, instead of using CUDA half intrinisics. -/// Most uses of this type within ATen are memory bound, including the -/// element-wise kernels, and the half intrinisics aren't efficient on all GPUs. -/// If you are writing a compute bound kernel, you can use the CUDA half -/// intrinsics directly on the Half type from device code. 
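// Usage sketch for the Half type declared below (include path illustrative).
// Arithmetic on at::Half promotes to float, as the note above explains, and
// detail::float2halfbits / detail::halfbits2float expose the raw conversions,
// so a round trip shows the precision loss of the 10-bit mantissa.
#include <iostream>
#include <ATen/Half.h>  // wherever at::Half lives in this tree

static void halfDemo() {
  at::Half a = 1.5f;  // exactly representable in half precision
  at::Half b = 0.1f;  // rounded to the nearest half-precision value
  float sum = a + b;  // computed in float32, not with half intrinsics

  unsigned short bits = at::detail::float2halfbits(0.1f);
  float back = at::detail::halfbits2float(bits);
  std::cout << sum << " " << back << "\n";  // `back` differs slightly from 0.1f
}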
- -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __CUDACC__ -#include -#endif - -#if defined(__HIP_DEVICE_COMPILE__) -#include -#endif - -#ifndef AT_HOSTDEVICE -#ifdef __CUDACC__ -#define AT_HOSTDEVICE __host__ __device__ -#else -#define AT_HOSTDEVICE -#endif -#endif - -namespace at { - -namespace detail { - -AT_CORE_API float halfbits2float(unsigned short bits); -AT_CORE_API unsigned short float2halfbits(float value); - -} // namespace detail - -struct alignas(2) Half { - unsigned short x; - - struct from_bits_t {}; - static constexpr from_bits_t from_bits = from_bits_t(); - - // HIP wants __host__ __device__ tag, CUDA does not -#ifdef __HIP_PLATFORM_HCC__ - AT_HOSTDEVICE Half() = default; -#else - Half() = default; -#endif - - constexpr AT_HOSTDEVICE Half(unsigned short bits, from_bits_t) : x(bits){}; - inline AT_HOSTDEVICE Half(float value); - inline AT_HOSTDEVICE operator float() const; - -#ifdef __CUDACC__ - inline AT_HOSTDEVICE Half(const __half& value); - inline AT_HOSTDEVICE operator __half() const; -#endif -}; - -template -To convert(From f) { - return static_cast(f); -} - -// skip isnan and isinf check for integral types -template -typename std::enable_if::value, bool>::type overflows( - From f) { - using limit = std::numeric_limits; - if (!limit::is_signed && std::numeric_limits::is_signed) { - // allow for negative numbers to wrap using two's complement arithmetic. - // For example, with uint8, this allows for `a - b` to be treated as - // `a + 255 * b`. - return f > limit::max() || (f < 0 && -(uint64_t)f > limit::max()); - } else { - return f < limit::lowest() || f > limit::max(); - } -} - -template -typename std::enable_if::value, bool>::type overflows( - From f) { - using limit = std::numeric_limits; - if (limit::has_infinity && std::isinf((double)f)) { - return false; - } - if (!limit::has_quiet_NaN && (f != f)) { - return true; - } - return f < limit::lowest() || f > limit::max(); -} - -template -To checked_convert(From f, const char* name) { - if (overflows(f)) { - std::ostringstream oss; - oss << "value cannot be converted to type " << name << " without overflow: " << f; - throw std::domain_error(oss.str()); - } - return convert(f); -} - -template -To HalfFix(From h) { - To ret; - ret.x = h.x; - return ret; -} - -AT_CORE_API std::ostream& operator<<(std::ostream& out, const Half& value); - -} // namespace at - -#include "ATen/core/Half-inl.h" - -#undef AT_HOSTDEVICE diff --git a/aten/src/ATen/core/IdWrapper.h b/aten/src/ATen/core/IdWrapper.h deleted file mode 100644 index 7d152269d9a8c2..00000000000000 --- a/aten/src/ATen/core/IdWrapper.h +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once - -#include - -namespace at { - -/** - * This template simplifies generation of simple classes that wrap an id - * in a typesafe way. Namely, you can use it to create a very lightweight - * type that only offers equality comparators and hashing. Example: - * - * struct MyIdType final : IdWrapper { - * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} - * }; - * - * Then in the global top level namespace: - * - * AT_DEFINE_HASH_FOR_IDWRAPPER(MyIdType); - * - * That's it - equality operators and hash functions are automatically defined - * for you, given the underlying type supports it. 
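// Sketch expanding the doc example above (type name and values are
// hypothetical, and the template parameter order is assumed from the aliases
// in the class body): once the hash macro is invoked at global scope, the
// wrapper can be used as a key in standard containers while staying distinct
// from a raw uint32_t.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <ATen/IdWrapper.h>  // wherever at::IdWrapper lives in this tree

struct MyIdType final : at::IdWrapper<MyIdType, uint32_t> {
  constexpr explicit MyIdType(uint32_t id) : IdWrapper(id) {}
};
AT_DEFINE_HASH_FOR_IDWRAPPER(MyIdType);

static void idWrapperDemo() {
  std::unordered_map<MyIdType, std::string> names;
  names.emplace(MyIdType(1), "conv1");
  names.emplace(MyIdType(2), "relu1");
  // Equality and hashing come from IdWrapper; there is no implicit conversion
  // back to uint32_t, so ids cannot be mixed up with plain integers.
}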
- */ -template -class IdWrapper { - public: - using underlying_type = UnderlyingType; - using concrete_type = ConcreteType; - - protected: - constexpr explicit IdWrapper(underlying_type id) noexcept( - noexcept(underlying_type(std::declval()))) - : id_(id) {} - - constexpr underlying_type underlyingId() const - noexcept(noexcept(underlying_type(std::declval()))) { - return id_; - } - - private: - friend size_t hash_value(const concrete_type& v) { - return std::hash()(v.id_); - } - - // TODO Making operator== noexcept if underlying type is noexcept equality - // comparable doesn't work with GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator==( - const concrete_type& lhs, - const concrete_type& rhs) { - return lhs.id_ == rhs.id_; - } - - // TODO Making operator!= noexcept if operator== is noexcept doesn't work with - // GCC 4.8. - // Fix this once we don't need GCC 4.8 anymore. - friend constexpr bool operator!=( - const concrete_type& lhs, - const concrete_type& rhs) { - return !(lhs == rhs); - } - - underlying_type id_; -}; - -} // namespace at - -#define AT_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ - namespace std { \ - template <> \ - struct hash { \ - size_t operator()(ClassName x) const { \ - return hash_value(x); \ - } \ - }; \ - } diff --git a/aten/src/ATen/core/README.md b/aten/src/ATen/core/README.md deleted file mode 100644 index 71654f44e26f91..00000000000000 --- a/aten/src/ATen/core/README.md +++ /dev/null @@ -1,5 +0,0 @@ -ATen Core ---------- - -ATen Core is a minimal subset of ATen which is suitable for deployment -on mobile. Binary size of files in this folder is an important constraint. diff --git a/aten/src/ATen/core/SmallVector.h b/aten/src/ATen/core/SmallVector.h deleted file mode 100644 index 269b21b0d5cf37..00000000000000 --- a/aten/src/ATen/core/SmallVector.h +++ /dev/null @@ -1,1034 +0,0 @@ -//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the SmallVector class. -// -//===----------------------------------------------------------------------===// - -// ATen: modified from llvm::SmallVector. -// replaced report_bad_alloc_error with std::bad_alloc -// replaced isPodLike with AT_IS_TRIVIALLY_COPYABLE -// replaced iterator_range constructor with inline Container&& constructor -// removed LLVM_NODISCARD and LLVM_ATTRIBUTE_ALWAYS_INLINE qualifiers -// removed LLVM_UNLIKELY - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if __GNUG__ && __GNUC__ < 5 -#define AT_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T) -#else -#define AT_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable::value -#endif - -namespace at { - -namespace detail { - -// From llvm/Support/MathExtras.h -static inline uint64_t NextPowerOf2(uint64_t A) { - A |= (A >> 1); - A |= (A >> 2); - A |= (A >> 4); - A |= (A >> 8); - A |= (A >> 16); - A |= (A >> 32); - return A + 1; -} - -} // namespace detail - -/// This is all the non-templated stuff common to all SmallVectors. 
-class AT_CORE_API SmallVectorBase { - protected: - void *BeginX, *EndX, *CapacityX; - - protected: - SmallVectorBase(void* FirstEl, size_t Size) - : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl + Size) {} - - /// This is an implementation of the grow() method which only works - /// on POD-like data types and is out of line to reduce code duplication. - void grow_pod(void* FirstEl, size_t MinSizeInBytes, size_t TSize); - - public: - /// This returns size()*sizeof(T). - size_t size_in_bytes() const { - return size_t((char*)EndX - (char*)BeginX); - } - - /// capacity_in_bytes - This returns capacity()*sizeof(T). - size_t capacity_in_bytes() const { - return size_t((char*)CapacityX - (char*)BeginX); - } - - bool empty() const { - return BeginX == EndX; - } -}; - -/// This is the part of SmallVectorTemplateBase which does not depend on whether -/// the type T is a POD. The extra dummy template argument is used by ArrayRef -/// to avoid unnecessarily requiring T to be complete. -template -class SmallVectorTemplateCommon : public SmallVectorBase { - private: - template - friend struct SmallVectorStorage; - - // Allocate raw space for N elements of type T. If T has a ctor or dtor, we - // don't want it to be automatically run, so we need to represent the space as - // something else. Use an array of char of sufficient alignment. - using U = AlignedCharArrayUnion; - U FirstEl; - // Space after 'FirstEl' is clobbered, do not add any instance vars after it. - - protected: - SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} - - void grow_pod(size_t MinSizeInBytes, size_t TSize) { - SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); - } - - /// Return true if this is a smallvector which has not had dynamic - /// memory allocated for it. - bool isSmall() const { - return BeginX == static_cast(&FirstEl); - } - - /// Put this vector in a state of being small. - void resetToSmall() { - BeginX = EndX = CapacityX = &FirstEl; - } - - void setEnd(T* P) { - this->EndX = P; - } - - public: - using size_type = size_t; - using difference_type = ptrdiff_t; - using value_type = T; - using iterator = T*; - using const_iterator = const T*; - - using const_reverse_iterator = std::reverse_iterator; - using reverse_iterator = std::reverse_iterator; - - using reference = T&; - using const_reference = const T&; - using pointer = T*; - using const_pointer = const T*; - - // forward iterator creation methods. - iterator begin() { - return (iterator)this->BeginX; - } - const_iterator begin() const { - return (const_iterator)this->BeginX; - } - iterator end() { - return (iterator)this->EndX; - } - const_iterator end() const { - return (const_iterator)this->EndX; - } - - protected: - iterator capacity_ptr() { - return (iterator)this->CapacityX; - } - const_iterator capacity_ptr() const { - return (const_iterator)this->CapacityX; - } - - public: - // reverse iterator creation methods. - reverse_iterator rbegin() { - return reverse_iterator(end()); - } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(end()); - } - reverse_iterator rend() { - return reverse_iterator(begin()); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(begin()); - } - - size_type size() const { - return end() - begin(); - } - size_type max_size() const { - return size_type(-1) / sizeof(T); - } - - /// Return the total number of elements in the currently allocated buffer. 
- size_t capacity() const { - return capacity_ptr() - begin(); - } - - /// Return a pointer to the vector's buffer, even if empty(). - pointer data() { - return pointer(begin()); - } - /// Return a pointer to the vector's buffer, even if empty(). - const_pointer data() const { - return const_pointer(begin()); - } - - reference operator[](size_type idx) { - assert(idx < size()); - return begin()[idx]; - } - const_reference operator[](size_type idx) const { - assert(idx < size()); - return begin()[idx]; - } - - reference front() { - assert(!empty()); - return begin()[0]; - } - const_reference front() const { - assert(!empty()); - return begin()[0]; - } - - reference back() { - assert(!empty()); - return end()[-1]; - } - const_reference back() const { - assert(!empty()); - return end()[-1]; - } -}; - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with non-POD-like T's. -template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { - protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - static void destroy_range(T* S, T* E) { - while (S != E) { - --E; - E->~T(); - } - } - - /// Move the range [I, E) into the uninitialized memory starting with "Dest", - /// constructing elements as needed. - template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy( - std::make_move_iterator(I), std::make_move_iterator(E), Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", - /// constructing elements as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - std::uninitialized_copy(I, E, Dest); - } - - /// Grow the allocated memory (without initializing new elements), doubling - /// the size of the allocated memory. Guarantees space for at least one more - /// element, or MinSize more elements if specified. - void grow(size_t MinSize = 0); - - public: - void push_back(const T& Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*)this->end()) T(Elt); - this->setEnd(this->end() + 1); - } - - void push_back(T&& Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*)this->end()) T(::std::move(Elt)); - this->setEnd(this->end() + 1); - } - - void pop_back() { - this->setEnd(this->end() - 1); - this->end()->~T(); - } -}; - -// Define this out-of-line to dissuade the C++ compiler from inlining it. -template -void SmallVectorTemplateBase::grow(size_t MinSize) { - size_t CurCapacity = this->capacity(); - size_t CurSize = this->size(); - // Always grow, even from zero. - size_t NewCapacity = size_t(detail::NextPowerOf2(CurCapacity + 2)); - if (NewCapacity < MinSize) - NewCapacity = MinSize; - T* NewElts = static_cast(malloc(NewCapacity * sizeof(T))); - if (NewElts == nullptr) - throw std::bad_alloc(); - - // Move the elements over. - this->uninitialized_move(this->begin(), this->end(), NewElts); - - // Destroy the original elements. - destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. - if (!this->isSmall()) - free(this->begin()); - - this->setEnd(NewElts + CurSize); - this->BeginX = NewElts; - this->CapacityX = this->begin() + NewCapacity; -} - -/// SmallVectorTemplateBase - This is where we put method -/// implementations that are designed to work with POD-like T's. 
-template -class SmallVectorTemplateBase : public SmallVectorTemplateCommon { - protected: - SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} - - // No need to do a destroy loop for POD's. - static void destroy_range(T*, T*) {} - - /// Move the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_move(It1 I, It1 E, It2 Dest) { - // Just do a copy. - uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - // Arbitrary iterator types; just use the basic implementation. - std::uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy( - T1* I, - T1* E, - T2* Dest, - typename std::enable_if< - std::is_same::type, T2>::value>:: - type* = nullptr) { - // Use memcpy for PODs iterated by pointers (which includes SmallVector - // iterators): std::uninitialized_copy optimizes to memmove, but we can - // use memcpy here. Note that I and E are iterators and thus might be - // invalid for memcpy if they are equal. - if (I != E) - memcpy(Dest, I, (E - I) * sizeof(T)); - } - - /// Double the size of the allocated memory, guaranteeing space for at - /// least one more element or MinSize if specified. - void grow(size_t MinSize = 0) { - this->grow_pod(MinSize * sizeof(T), sizeof(T)); - } - - public: - void push_back(const T& Elt) { - if (this->EndX >= this->CapacityX) - this->grow(); - memcpy(this->end(), &Elt, sizeof(T)); - this->setEnd(this->end() + 1); - } - - void pop_back() { - this->setEnd(this->end() - 1); - } -}; - -/// This class consists of common code factored out of the SmallVector class to -/// reduce code duplication based on the SmallVector 'N' template parameter. -template -class SmallVectorImpl - : public SmallVectorTemplateBase { - using SuperClass = SmallVectorTemplateBase; - - public: - using iterator = typename SuperClass::iterator; - using const_iterator = typename SuperClass::const_iterator; - using size_type = typename SuperClass::size_type; - - protected: - // Default ctor - Initialize to empty. - explicit SmallVectorImpl(unsigned N) - : SmallVectorTemplateBase(N * sizeof(T)) { - } - - public: - SmallVectorImpl(const SmallVectorImpl&) = delete; - - ~SmallVectorImpl() { - // Destroy the constructed elements in the vector. - this->destroy_range(this->begin(), this->end()); - - // If this wasn't grown from the inline copy, deallocate the old space. 
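The destructor here follows the same two-step teardown used throughout the class: run the element destructors, then release the buffer only when it was heap-allocated, since the inline buffer is part of the vector object itself and must never be passed to free(). A standalone sketch of that pattern (plain C++, not ATen code):

#include <cstdlib>
#include <new>
#include <string>

// Destroy [s, e) in reverse order, as destroy_range() above does.
template <typename T>
static void destroy_range(T* s, T* e) {
  while (s != e) { --e; e->~T(); }
}

int main() {
  // "Small" case: elements live in a stack buffer; destroy, but do not free.
  alignas(std::string) unsigned char inline_buf[2 * sizeof(std::string)];
  std::string* small_elts = reinterpret_cast<std::string*>(inline_buf);
  new (small_elts) std::string("a");
  new (small_elts + 1) std::string("b");
  destroy_range(small_elts, small_elts + 2);   // no free(): storage is inline

  // "Big" case: elements live on the heap; destroy, then free the block.
  void* raw = std::malloc(2 * sizeof(std::string));
  std::string* big_elts = static_cast<std::string*>(raw);
  new (big_elts) std::string("c");
  new (big_elts + 1) std::string("d");
  destroy_range(big_elts, big_elts + 2);
  std::free(raw);
  return 0;
}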
- if (!this->isSmall()) - free(this->begin()); - } - - void clear() { - this->destroy_range(this->begin(), this->end()); - this->EndX = this->BeginX; - } - - void resize(size_type N) { - if (N < this->size()) { - this->destroy_range(this->begin() + N, this->end()); - this->setEnd(this->begin() + N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - auto I = this->end(); - for (auto E = this->begin() + N; I != E; ++I) - new (&*I) T(); - this->setEnd(this->begin() + N); - } - } - - void resize(size_type N, const T& NV) { - if (N < this->size()) { - this->destroy_range(this->begin() + N, this->end()); - this->setEnd(this->begin() + N); - } else if (N > this->size()) { - if (this->capacity() < N) - this->grow(N); - std::uninitialized_fill(this->end(), this->begin() + N, NV); - this->setEnd(this->begin() + N); - } - } - - void reserve(size_type N) { - if (this->capacity() < N) - this->grow(N); - } - - T pop_back_val() { - T Result = ::std::move(this->back()); - this->pop_back(); - return Result; - } - - void swap(SmallVectorImpl& RHS); - - /// Add the specified range to the end of the SmallVector. - template < - typename in_iter, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - void append(in_iter in_start, in_iter in_end) { - size_type NumInputs = std::distance(in_start, in_end); - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr() - this->end())) - this->grow(this->size() + NumInputs); - - // Copy the new elements over. - this->uninitialized_copy(in_start, in_end, this->end()); - this->setEnd(this->end() + NumInputs); - } - - /// Add the specified range to the end of the SmallVector. - void append(size_type NumInputs, const T& Elt) { - // Grow allocated space if needed. - if (NumInputs > size_type(this->capacity_ptr() - this->end())) - this->grow(this->size() + NumInputs); - - // Copy the new elements over. - std::uninitialized_fill_n(this->end(), NumInputs, Elt); - this->setEnd(this->end() + NumInputs); - } - - void append(std::initializer_list IL) { - append(IL.begin(), IL.end()); - } - - // FIXME: Consider assigning over existing elements, rather than clearing & - // re-initializing them - for all assign(...) variants. - - void assign(size_type NumElts, const T& Elt) { - clear(); - if (this->capacity() < NumElts) - this->grow(NumElts); - this->setEnd(this->begin() + NumElts); - std::uninitialized_fill(this->begin(), this->end(), Elt); - } - - template < - typename in_iter, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - void assign(in_iter in_start, in_iter in_end) { - clear(); - append(in_start, in_end); - } - - void assign(std::initializer_list IL) { - clear(); - append(IL); - } - - iterator erase(const_iterator CI) { - // Just cast away constness because this is a non-const member function. - iterator I = const_cast(CI); - - assert(I >= this->begin() && "Iterator to erase is out of bounds."); - assert(I < this->end() && "Erasing at past-the-end iterator."); - - iterator N = I; - // Shift all elts down one. - std::move(I + 1, this->end(), I); - // Drop the last elt. - this->pop_back(); - return (N); - } - - iterator erase(const_iterator CS, const_iterator CE) { - // Just cast away constness because this is a non-const member function. 
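The single-element erase() above is the classic shift-and-pop strategy: move the tail left by one, then destroy the now redundant last slot. The same steps expressed with std::vector, purely as a runnable illustration:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> v{10, 20, 30, 40};
  auto I = v.begin() + 1;          // erase the 20
  std::move(I + 1, v.end(), I);    // v is now {10, 30, 40, 40}
  v.pop_back();                    // drop the stale last element
  assert((v == std::vector<int>{10, 30, 40}));
  return 0;
}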
- iterator S = const_cast(CS); - iterator E = const_cast(CE); - - assert(S >= this->begin() && "Range to erase is out of bounds."); - assert(S <= E && "Trying to erase invalid range."); - assert(E <= this->end() && "Trying to erase past the end."); - - iterator N = S; - // Shift all elts down. - iterator I = std::move(E, this->end(), S); - // Drop the last elts. - this->destroy_range(I, this->end()); - this->setEnd(I); - return (N); - } - - iterator insert(iterator I, T&& Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(::std::move(Elt)); - return this->end() - 1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I - this->begin(); - this->grow(); - I = this->begin() + EltNo; - } - - ::new ((void*)this->end()) T(::std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end() - 1, this->end()); - this->setEnd(this->end() + 1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - T* EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = ::std::move(*EltPtr); - return I; - } - - iterator insert(iterator I, const T& Elt) { - if (I == this->end()) { // Important special case for empty vector. - this->push_back(Elt); - return this->end() - 1; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - if (this->EndX >= this->CapacityX) { - size_t EltNo = I - this->begin(); - this->grow(); - I = this->begin() + EltNo; - } - ::new ((void*)this->end()) T(std::move(this->back())); - // Push everything else over. - std::move_backward(I, this->end() - 1, this->end()); - this->setEnd(this->end() + 1); - - // If we just moved the element we're inserting, be sure to update - // the reference. - const T* EltPtr = &Elt; - if (I <= EltPtr && EltPtr < this->EndX) - ++EltPtr; - - *I = *EltPtr; - return I; - } - - iterator insert(iterator I, size_type NumToInsert, const T& Elt) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. - append(NumToInsert, Elt); - return this->begin() + InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin() + InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end() - I) >= NumToInsert) { - T* OldEnd = this->end(); - append( - std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd - NumToInsert, OldEnd); - - std::fill_n(I, NumToInsert, Elt); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. 
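The insert() overloads repeatedly use the save-an-index idiom seen above: any operation that may reallocate (grow() or reserve()) invalidates iterators, so the position is converted to an element index first and the iterator is rebuilt afterwards. A small std::vector sketch of the same idiom:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  auto it = v.begin() + 1;
  std::size_t idx = static_cast<std::size_t>(it - v.begin());  // save position as an index
  v.reserve(v.capacity() + 16);                                // may invalidate 'it'
  it = v.begin() + idx;                                        // rebuild the iterator
  assert(*it == 2);
  return 0;
}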
- T* OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd - I; - this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); - - // Replace the overwritten part. - std::fill_n(I, NumOverwritten, Elt); - - // Insert the non-overwritten middle part. - std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, Elt); - return I; - } - - template < - typename ItTy, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - iterator insert(iterator I, ItTy From, ItTy To) { - // Convert iterator to elt# to avoid invalidating iterator when we reserve() - size_t InsertElt = I - this->begin(); - - if (I == this->end()) { // Important special case for empty vector. - append(From, To); - return this->begin() + InsertElt; - } - - assert(I >= this->begin() && "Insertion iterator is out of bounds."); - assert(I <= this->end() && "Inserting past the end of the vector."); - - size_t NumToInsert = std::distance(From, To); - - // Ensure there is enough space. - reserve(this->size() + NumToInsert); - - // Uninvalidate the iterator. - I = this->begin() + InsertElt; - - // If there are more elements between the insertion point and the end of the - // range than there are being inserted, we can use a simple approach to - // insertion. Since we already reserved space, we know that this won't - // reallocate the vector. - if (size_t(this->end() - I) >= NumToInsert) { - T* OldEnd = this->end(); - append( - std::move_iterator(this->end() - NumToInsert), - std::move_iterator(this->end())); - - // Copy the existing elements that get replaced. - std::move_backward(I, OldEnd - NumToInsert, OldEnd); - - std::copy(From, To, I); - return I; - } - - // Otherwise, we're inserting more elements than exist already, and we're - // not inserting at the end. - - // Move over the elements that we're about to overwrite. - T* OldEnd = this->end(); - this->setEnd(this->end() + NumToInsert); - size_t NumOverwritten = OldEnd - I; - this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); - - // Replace the overwritten part. - for (T* J = I; NumOverwritten > 0; --NumOverwritten) { - *J = *From; - ++J; - ++From; - } - - // Insert the non-overwritten middle part. - this->uninitialized_copy(From, To, OldEnd); - return I; - } - - void insert(iterator I, std::initializer_list IL) { - insert(I, IL.begin(), IL.end()); - } - - template - void emplace_back(ArgTypes&&... Args) { - if (this->EndX >= this->CapacityX) - this->grow(); - ::new ((void*)this->end()) T(std::forward(Args)...); - this->setEnd(this->end() + 1); - } - - SmallVectorImpl& operator=(const SmallVectorImpl& RHS); - - SmallVectorImpl& operator=(SmallVectorImpl&& RHS); - - bool operator==(const SmallVectorImpl& RHS) const { - if (this->size() != RHS.size()) - return false; - return std::equal(this->begin(), this->end(), RHS.begin()); - } - bool operator!=(const SmallVectorImpl& RHS) const { - return !(*this == RHS); - } - - bool operator<(const SmallVectorImpl& RHS) const { - return std::lexicographical_compare( - this->begin(), this->end(), RHS.begin(), RHS.end()); - } - - /// Set the array size to \p N, which the current array must have enough - /// capacity for. - /// - /// This does not construct or destroy any elements in the vector. - /// - /// Clients can use this in conjunction with capacity() to write past the end - /// of the buffer when they know that more elements are available, and only - /// update the size later. 
This avoids the cost of value initializing elements - /// which will only be overwritten. - void set_size(size_type N) { - assert(N <= this->capacity()); - this->setEnd(this->begin() + N); - } -}; - -template -void SmallVectorImpl::swap(SmallVectorImpl& RHS) { - if (this == &RHS) - return; - - // We can only avoid copying elements if neither vector is small. - if (!this->isSmall() && !RHS.isSmall()) { - std::swap(this->BeginX, RHS.BeginX); - std::swap(this->EndX, RHS.EndX); - std::swap(this->CapacityX, RHS.CapacityX); - return; - } - if (RHS.size() > this->capacity()) - this->grow(RHS.size()); - if (this->size() > RHS.capacity()) - RHS.grow(this->size()); - - // Swap the shared elements. - size_t NumShared = this->size(); - if (NumShared > RHS.size()) - NumShared = RHS.size(); - for (size_type i = 0; i != NumShared; ++i) - std::swap((*this)[i], RHS[i]); - - // Copy over the extra elts. - if (this->size() > RHS.size()) { - size_t EltDiff = this->size() - RHS.size(); - this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end()); - RHS.setEnd(RHS.end() + EltDiff); - this->destroy_range(this->begin() + NumShared, this->end()); - this->setEnd(this->begin() + NumShared); - } else if (RHS.size() > this->size()) { - size_t EltDiff = RHS.size() - this->size(); - this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end()); - this->setEnd(this->end() + EltDiff); - this->destroy_range(RHS.begin() + NumShared, RHS.end()); - RHS.setEnd(RHS.begin() + NumShared); - } -} - -template -SmallVectorImpl& SmallVectorImpl::operator=( - const SmallVectorImpl& RHS) { - // Avoid self-assignment. - if (this == &RHS) - return *this; - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. - size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd; - if (RHSSize) - NewEnd = std::copy(RHS.begin(), RHS.begin() + RHSSize, this->begin()); - else - NewEnd = this->begin(); - - // Destroy excess elements. - this->destroy_range(NewEnd, this->end()); - - // Trim. - this->setEnd(NewEnd); - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. - if (this->capacity() < RHSSize) { - // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::copy(RHS.begin(), RHS.begin() + CurSize, this->begin()); - } - - // Copy construct the new elements in place. - this->uninitialized_copy( - RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); - - // Set end. - this->setEnd(this->begin() + RHSSize); - return *this; -} - -template -SmallVectorImpl& SmallVectorImpl::operator=(SmallVectorImpl&& RHS) { - // Avoid self-assignment. - if (this == &RHS) - return *this; - - // If the RHS isn't small, clear this vector and then steal its buffer. - if (!RHS.isSmall()) { - this->destroy_range(this->begin(), this->end()); - if (!this->isSmall()) - free(this->begin()); - this->BeginX = RHS.BeginX; - this->EndX = RHS.EndX; - this->CapacityX = RHS.CapacityX; - RHS.resetToSmall(); - return *this; - } - - // If we already have sufficient space, assign the common elements, then - // destroy any excess. 
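The move assignment above steals the heap buffer only when the right-hand side is not small; if its elements live in the inline buffer, the data pointers refer into the RHS object itself, so they cannot simply be handed over and the element-wise std::move path is taken instead. A contrived sketch of the aliasing hazard (a toy type, not ATen code):

#include <cassert>

// Toy illustration: when a container's data pointer refers into the object's
// own inline buffer, that pointer cannot be transferred to another object.
struct InlineBuf {
  int storage[4] = {0, 0, 0, 0};
  int* begin = storage;                  // "small" state: points into *this
  bool is_small() const { return begin == storage; }
};

int main() {
  InlineBuf a, b;
  assert(a.is_small() && b.is_small());

  b.begin = a.begin;                     // naive "steal" of the buffer pointer
  assert(!b.is_small());                 // b now aliases a's inline storage...
  assert(b.begin == a.storage);          // ...which dies with a; hence the
                                         // element-wise move fallback.
  return 0;
}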
- size_t RHSSize = RHS.size(); - size_t CurSize = this->size(); - if (CurSize >= RHSSize) { - // Assign common elements. - iterator NewEnd = this->begin(); - if (RHSSize) - NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); - - // Destroy excess elements and trim the bounds. - this->destroy_range(NewEnd, this->end()); - this->setEnd(NewEnd); - - // Clear the RHS. - RHS.clear(); - - return *this; - } - - // If we have to grow to have enough elements, destroy the current elements. - // This allows us to avoid copying them during the grow. - // FIXME: this may not actually make any sense if we can efficiently move - // elements. - if (this->capacity() < RHSSize) { - // Destroy current elements. - this->destroy_range(this->begin(), this->end()); - this->setEnd(this->begin()); - CurSize = 0; - this->grow(RHSSize); - } else if (CurSize) { - // Otherwise, use assignment for the already-constructed elements. - std::move(RHS.begin(), RHS.begin() + CurSize, this->begin()); - } - - // Move-construct the new elements in place. - this->uninitialized_move( - RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); - - // Set end. - this->setEnd(this->begin() + RHSSize); - - RHS.clear(); - return *this; -} - -/// Storage for the SmallVector elements which aren't contained in -/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1' -/// element is in the base class. This is specialized for the N=1 and N=0 cases -/// to avoid allocating unnecessary storage. -template -struct SmallVectorStorage { - typename SmallVectorTemplateCommon::U InlineElts[N - 1]; -}; -template -struct SmallVectorStorage {}; -template -struct SmallVectorStorage {}; - -/// This is a 'vector' (really, a variable-sized array), optimized -/// for the case when the array is small. It contains some number of elements -/// in-place, which allows it to avoid heap allocation when the actual number of -/// elements is below that threshold. This allows normal "small" cases to be -/// fast without losing generality for large inputs. -/// -/// Note that this does not attempt to be exception safe. -/// -template -class SmallVector : public SmallVectorImpl { - /// Inline space for elements which aren't stored in the base class. 
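SmallVectorStorage supplies only N - 1 slots because the common base class already reserves space for one element (FirstEl), and the N = 1 and N = 0 specializations above collapse to empty structs. A layout sketch that leans on the same assumption the original makes, namely that the extra slots end up directly after the base's first element:

#include <iostream>
#include <type_traits>

template <typename T>
struct BaseWithOneElt {
  typename std::aligned_storage<sizeof(T), alignof(T)>::type FirstEl;  // 1 inline slot
};

template <typename T, unsigned N>
struct ExtraSlots {
  typename std::aligned_storage<sizeof(T), alignof(T)>::type InlineElts[N - 1];
};

template <typename T, unsigned N>
struct ToyVec : BaseWithOneElt<T> {
  ExtraSlots<T, N> Storage;  // N - 1 more slots, mirroring SmallVectorStorage
};

int main() {
  // 1 + 7 slots of inline space for a "small size" of 8.
  std::cout << sizeof(ToyVec<int, 8>) << " bytes of inline storage\n";
  return 0;
}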
- SmallVectorStorage Storage; - - public: - SmallVector() : SmallVectorImpl(N) {} - - explicit SmallVector(size_t Size, const T& Value = T()) - : SmallVectorImpl(N) { - this->assign(Size, Value); - } - - template < - typename ItTy, - typename = typename std::enable_if::iterator_category, - std::input_iterator_tag>::value>::type> - SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { - this->append(S, E); - } - - template - explicit SmallVector(Container&& c) : SmallVectorImpl(N) { - this->append(c.begin(), c.end()); - } - - SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { - this->assign(IL); - } - - SmallVector(const SmallVector& RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(RHS); - } - - const SmallVector& operator=(const SmallVector& RHS) { - SmallVectorImpl::operator=(RHS); - return *this; - } - - SmallVector(SmallVector&& RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - template - const SmallVector& operator=(const Container& RHS) { - this->assign(RHS.begin(), RHS.end()); - return *this; - } - - SmallVector(SmallVectorImpl&& RHS) : SmallVectorImpl(N) { - if (!RHS.empty()) - SmallVectorImpl::operator=(::std::move(RHS)); - } - - const SmallVector& operator=(SmallVector&& RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - const SmallVector& operator=(SmallVectorImpl&& RHS) { - SmallVectorImpl::operator=(::std::move(RHS)); - return *this; - } - - template - const SmallVector& operator=(Container&& C) { - this->assign(C.begin(), C.end()); - return *this; - } - - const SmallVector& operator=(std::initializer_list IL) { - this->assign(IL); - return *this; - } -}; - -template -inline size_t capacity_in_bytes(const SmallVector& X) { - return X.capacity_in_bytes(); -} - -} // end namespace at - -namespace std { - -/// Implement std::swap in terms of SmallVector swap. -template -inline void swap(at::SmallVectorImpl& LHS, at::SmallVectorImpl& RHS) { - LHS.swap(RHS); -} - -/// Implement std::swap in terms of SmallVector swap. -template -inline void swap(at::SmallVector& LHS, at::SmallVector& RHS) { - LHS.swap(RHS); -} - -} // end namespace std diff --git a/aten/src/ATen/core/UniqueVoidPtr.cpp b/aten/src/ATen/core/UniqueVoidPtr.cpp deleted file mode 100644 index fd08f7e13d2bf8..00000000000000 --- a/aten/src/ATen/core/UniqueVoidPtr.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include - -namespace at { -namespace detail { - -void deleteNothing(void*) {} - -} // namespace detail -} // namespace at diff --git a/aten/src/ATen/core/optional.h b/aten/src/ATen/core/optional.h deleted file mode 100644 index 8b0a7bfc4ead31..00000000000000 --- a/aten/src/ATen/core/optional.h +++ /dev/null @@ -1,1027 +0,0 @@ -// Copyright (C) 2011 - 2012 Andrzej Krzemienski. -// -// Use, modification, and distribution is subject to the Boost Software -// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -// -// The idea and interface is based on Boost.Optional library -// authored by Fernando Luis Cacciola Carballal -// -// From https://github.com/akrzemi1/Optional -// -// ATen: -// - Move to `at` namespace. -// - Remove macro use in line 478 because the nvcc device compiler cannot handle -// it. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#define TR2_OPTIONAL_REQUIRES(...) 
\ - typename std::enable_if<__VA_ARGS__::value, bool>::type = false - -#if defined __GNUC__ // NOTE: GNUC is also defined for Clang -#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) -#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -#elif (__GNUC__ > 4) -#define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ -#endif -# -#if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) -#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -#elif (__GNUC__ > 4) -#define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ -#endif -# -#if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) -#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) -#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#elif (__GNUC__ > 4) -#define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#endif -#endif -# -#if defined __clang_major__ -#if (__clang_major__ == 3 && __clang_minor__ >= 5) -#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -#elif (__clang_major__ > 3) -#define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -#endif -#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ -#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -#elif ( \ - __clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) -#define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ -#endif -#endif -# -#if defined _MSC_VER -#if (_MSC_VER >= 1900) -#define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -#endif -#endif - -#if defined __clang__ -#if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) -#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -#else -#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -#endif -#elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -#elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ -#define OPTIONAL_HAS_THIS_RVALUE_REFS 1 -#else -#define OPTIONAL_HAS_THIS_RVALUE_REFS 0 -#endif - -#if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ -#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 -#define OPTIONAL_CONSTEXPR_INIT_LIST constexpr -#else -#define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 -#define OPTIONAL_CONSTEXPR_INIT_LIST -#endif - -#if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && \ - (__cplusplus != 201103L) -#define OPTIONAL_HAS_MOVE_ACCESSORS 1 -#else -#define OPTIONAL_HAS_MOVE_ACCESSORS 0 -#endif - -#// In C++11 constexpr implies const, so we need to make non-const members also non-constexpr -#if (defined __cplusplus) && (__cplusplus == 201103L) -#define OPTIONAL_MUTABLE_CONSTEXPR -#else -#define OPTIONAL_MUTABLE_CONSTEXPR constexpr -#endif - -namespace at { - -// 20.5.4, optional for object types -template -class optional; - -// 20.5.5, optional for lvalue reference types -template -class optional; - -// workaround: std utility functions aren't constexpr yet -template -inline constexpr T&& constexpr_forward( - typename std::remove_reference::type& t) noexcept { - return static_cast(t); -} - -template -inline constexpr T&& constexpr_forward( - typename std::remove_reference::type&& t) noexcept { - static_assert(!std::is_lvalue_reference::value, "!!"); - return static_cast(t); -} - -template -inline constexpr typename std::remove_reference::type&& constexpr_move( - T&& t) noexcept { - return static_cast::type&&>(t); -} - -#if defined NDEBUG -#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) -#else -#define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) \ - ((CHECK) ? (EXPR) : ([] { assert(!#CHECK); }(), (EXPR))) -#endif - -namespace detail_ { - -// static_addressof: a constexpr version of addressof -template -struct has_overloaded_addressof { - template - constexpr static bool has_overload(...) 
{ - return false; - } - - template ().operator&())> - constexpr static bool has_overload(bool) { - return true; - } - - constexpr static bool value = has_overload(true); -}; - -template )> -constexpr T* static_addressof(T& ref) { - return &ref; -} - -template )> -T* static_addressof(T& ref) { - return std::addressof(ref); -} - -// the call to convert(b) has return type A and converts b to type A iff b -// decltype(b) is implicitly convertible to A -template -constexpr U convert(U v) { - return v; -} - -} // namespace detail_ - -constexpr struct trivial_init_t { -} trivial_init{}; - -// 20.5.6, In-place construction -constexpr struct in_place_t { -} in_place{}; - -// 20.5.7, Disengaged state indicator -struct nullopt_t { - struct init {}; - constexpr explicit nullopt_t(init) {} -}; -constexpr nullopt_t nullopt{nullopt_t::init()}; - -// 20.5.8, class bad_optional_access -class bad_optional_access : public std::logic_error { - public: - explicit bad_optional_access(const std::string& what_arg) - : logic_error{what_arg} {} - explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} -}; - -template -union storage_t { - unsigned char dummy_; - T value_; - - constexpr storage_t(trivial_init_t) noexcept : dummy_(){}; - - template - constexpr storage_t(Args&&... args) - : value_(constexpr_forward(args)...) {} - - ~storage_t() {} -}; - -template -union constexpr_storage_t { - unsigned char dummy_; - T value_; - - constexpr constexpr_storage_t(trivial_init_t) noexcept : dummy_(){}; - - template - constexpr constexpr_storage_t(Args&&... args) - : value_(constexpr_forward(args)...) {} - - ~constexpr_storage_t() = default; -}; - -template -struct optional_base { - bool init_; - storage_t storage_; - - constexpr optional_base() noexcept : init_(false), storage_(trivial_init){}; - - explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} - - explicit constexpr optional_base(T&& v) - : init_(true), storage_(constexpr_move(v)) {} - - template - explicit optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template < - class U, - class... Args, - TR2_OPTIONAL_REQUIRES(std::is_constructible>)> - explicit optional_base( - in_place_t, - std::initializer_list il, - Args&&... args) - : init_(true), storage_(il, std::forward(args)...) {} - - ~optional_base() { - if (init_) - storage_.value_.T::~T(); - } -}; - -template -struct constexpr_optional_base { - bool init_; - constexpr_storage_t storage_; - - constexpr constexpr_optional_base() noexcept - : init_(false), storage_(trivial_init){}; - - explicit constexpr constexpr_optional_base(const T& v) - : init_(true), storage_(v) {} - - explicit constexpr constexpr_optional_base(T&& v) - : init_(true), storage_(constexpr_move(v)) {} - - template - explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) - : init_(true), storage_(constexpr_forward(args)...) {} - - template < - class U, - class... Args, - TR2_OPTIONAL_REQUIRES(std::is_constructible>)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base( - in_place_t, - std::initializer_list il, - Args&&... args) - : init_(true), storage_(il, std::forward(args)...) 
{} - - ~constexpr_optional_base() = default; -}; - -template -using OptionalBase = typename std::conditional< - std::is_trivially_destructible::value, // if possible - constexpr_optional_base::type>, // use base with trivial destructor - optional_base::type>>::type; - -template -class optional : private OptionalBase { - static_assert( - !std::is_same::type, nullopt_t>::value, - "bad T"); - static_assert( - !std::is_same::type, in_place_t>::value, - "bad T"); - - constexpr bool initialized() const noexcept { - return OptionalBase::init_; - } - typename std::remove_const::type* dataptr() { - return std::addressof(OptionalBase::storage_.value_); - } - constexpr const T* dataptr() const { - return detail_::static_addressof(OptionalBase::storage_.value_); - } - -#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - constexpr const T& contained_val() const& { - return OptionalBase::storage_.value_; - } -#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { - return std::move(OptionalBase::storage_.value_); - } - OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { - return OptionalBase::storage_.value_; - } -#else - T& contained_val() & { - return OptionalBase::storage_.value_; - } - T&& contained_val() && { - return std::move(OptionalBase::storage_.value_); - } -#endif -#else - constexpr const T& contained_val() const { - return OptionalBase::storage_.value_; - } - T& contained_val() { - return OptionalBase::storage_.value_; - } -#endif - - void clear() noexcept { - if (initialized()) - dataptr()->T::~T(); - OptionalBase::init_ = false; - } - - template - void initialize(Args&&... args) noexcept( - noexcept(T(std::forward(args)...))) { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(std::forward(args)...); - OptionalBase::init_ = true; - } - - template - void initialize(std::initializer_list il, Args&&... args) noexcept( - noexcept(T(il, std::forward(args)...))) { - assert(!OptionalBase::init_); - ::new (static_cast(dataptr())) T(il, std::forward(args)...); - OptionalBase::init_ = true; - } - - public: - typedef T value_type; - - // 20.5.5.1, constructors - constexpr optional() noexcept : OptionalBase(){}; - constexpr optional(nullopt_t) noexcept : OptionalBase(){}; - - optional(const optional& rhs) : OptionalBase() { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(*rhs); - OptionalBase::init_ = true; - } - } - - optional(optional&& rhs) noexcept( - std::is_nothrow_move_constructible::value) - : OptionalBase() { - if (rhs.initialized()) { - ::new (static_cast(dataptr())) T(std::move(*rhs)); - OptionalBase::init_ = true; - } - } - - constexpr optional(const T& v) : OptionalBase(v) {} - - constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} - - template - explicit constexpr optional(in_place_t, Args&&... args) - : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} - - template < - class U, - class... Args, - TR2_OPTIONAL_REQUIRES(std::is_constructible>)> - OPTIONAL_CONSTEXPR_INIT_LIST explicit optional( - in_place_t, - std::initializer_list il, - Args&&... args) - : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} - - // 20.5.4.2, Destructor - ~optional() = default; - - // 20.5.4.3, assignment - optional& operator=(nullopt_t) noexcept { - clear(); - return *this; - } - - optional& operator=(const optional& rhs) { - if (initialized() == true && rhs.initialized() == false) - clear(); - else if (initialized() == false && rhs.initialized() == true) - initialize(*rhs); - else if (initialized() == true && rhs.initialized() == true) - contained_val() = *rhs; - return *this; - } - - optional& operator=(optional&& rhs) noexcept( - std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value) { - if (initialized() == true && rhs.initialized() == false) - clear(); - else if (initialized() == false && rhs.initialized() == true) - initialize(std::move(*rhs)); - else if (initialized() == true && rhs.initialized() == true) - contained_val() = std::move(*rhs); - return *this; - } - - template - auto operator=(U&& v) -> typename std::enable_if< - std::is_same::type, T>::value, - optional&>::type { - if (initialized()) { - contained_val() = std::forward(v); - } else { - initialize(std::forward(v)); - } - return *this; - } - - template - void emplace(Args&&... args) { - clear(); - initialize(std::forward(args)...); - } - - template - void emplace(std::initializer_list il, Args&&... args) { - clear(); - initialize(il, std::forward(args)...); - } - - // 20.5.4.4, Swap - void swap(optional& rhs) noexcept( - std::is_nothrow_move_constructible::value&& noexcept( - swap(std::declval(), std::declval()))) { - if (initialized() == true && rhs.initialized() == false) { - rhs.initialize(std::move(**this)); - clear(); - } else if (initialized() == false && rhs.initialized() == true) { - initialize(std::move(*rhs)); - rhs.clear(); - } else if (initialized() == true && rhs.initialized() == true) { - using std::swap; - swap(**this, *rhs); - } - } - - // 20.5.4.5, Observers - - explicit constexpr operator bool() const noexcept { - return initialized(); - } - constexpr bool has_value() const noexcept { - return initialized(); - } - - constexpr T const* operator->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); - } - -#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - OPTIONAL_MUTABLE_CONSTEXPR T* operator->() { - assert(initialized()); - return dataptr(); - } - - constexpr T const& operator*() const& { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& operator*() & { - assert(initialized()); - return contained_val(); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& operator*() && { - assert(initialized()); - return constexpr_move(contained_val()); - } - - constexpr T const& value() const& { - return initialized() - ? contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T& value() & { - return initialized() - ? contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - - OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { - if (!initialized()) - throw bad_optional_access("bad optional access"); - return std::move(contained_val()); - } - -#else - - T* operator->() { - assert(initialized()); - return dataptr(); - } - - constexpr T const& operator*() const { - return contained_val(); - } - - T& operator*() { - assert(initialized()); - return contained_val(); - } - - constexpr T const& value() const { - return initialized() - ? 
contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - - T& value() { - return initialized() - ? contained_val() - : (throw bad_optional_access("bad optional access"), contained_val()); - } - -#endif - -#if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 - - template - constexpr T value_or(V&& v) const& { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -#if OPTIONAL_HAS_MOVE_ACCESSORS == 1 - - template - OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && { - return *this - ? constexpr_move(const_cast&>(*this).contained_val()) - : detail_::convert(constexpr_forward(v)); - } - -#else - - template - T value_or(V&& v) && { - return *this - ? constexpr_move(const_cast&>(*this).contained_val()) - : detail_::convert(constexpr_forward(v)); - } - -#endif - -#else - - template - constexpr T value_or(V&& v) const { - return *this ? **this : detail_::convert(constexpr_forward(v)); - } - -#endif - - // 20.6.3.6, modifiers - void reset() noexcept { - clear(); - } -}; - -template -class optional { - static_assert(!std::is_same::value, "bad T"); - static_assert(!std::is_same::value, "bad T"); - T* ref; - - public: - // 20.5.5.1, construction/destruction - constexpr optional() noexcept : ref(nullptr) {} - - constexpr optional(nullopt_t) noexcept : ref(nullptr) {} - - constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} - - optional(T&&) = delete; - - constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} - - explicit constexpr optional(in_place_t, T& v) noexcept - : ref(detail_::static_addressof(v)) {} - - explicit optional(in_place_t, T&&) = delete; - - ~optional() = default; - - // 20.5.5.2, mutation - optional& operator=(nullopt_t) noexcept { - ref = nullptr; - return *this; - } - - // optional& operator=(const optional& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - // optional& operator=(optional&& rhs) noexcept { - // ref = rhs.ref; - // return *this; - // } - - template - auto operator=(U&& rhs) noexcept -> typename std::enable_if< - std::is_same::type, optional>::value, - optional&>::type { - ref = rhs.ref; - return *this; - } - - template - auto operator=(U&& rhs) noexcept -> typename std::enable_if< - !std::is_same::type, optional>::value, - optional&>::type = delete; - - void emplace(T& v) noexcept { - ref = detail_::static_addressof(v); - } - - void emplace(T&&) = delete; - - void swap(optional& rhs) noexcept { - std::swap(ref, rhs.ref); - } - - // 20.5.5.3, observers - constexpr T* operator->() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); - } - - constexpr T& operator*() const { - return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); - } - - constexpr T& value() const { - return ref ? *ref - : (throw bad_optional_access("bad optional access"), *ref); - } - - explicit constexpr operator bool() const noexcept { - return ref != nullptr; - } - - constexpr bool has_value() const noexcept { - return ref != nullptr; - } - - template - constexpr typename std::decay::type value_or(V&& v) const { - return *this ? **this - : detail_::convert::type>( - constexpr_forward(v)); - } - - // x.x.x.x, modifiers - void reset() noexcept { - ref = nullptr; - } -}; - -template -class optional { - static_assert(sizeof(T) == 0, "optional rvalue references disallowed"); -}; - -// 20.5.8, Relational operators -template -constexpr bool operator==(const optional& x, const optional& y) { - return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; -} - -template -constexpr bool operator!=(const optional& x, const optional& y) { - return !(x == y); -} - -template -constexpr bool operator<(const optional& x, const optional& y) { - return (!y) ? false : (!x) ? true : *x < *y; -} - -template -constexpr bool operator>(const optional& x, const optional& y) { - return (y < x); -} - -template -constexpr bool operator<=(const optional& x, const optional& y) { - return !(y < x); -} - -template -constexpr bool operator>=(const optional& x, const optional& y) { - return !(x < y); -} - -// 20.5.9, Comparison with nullopt -template -constexpr bool operator==(const optional& x, nullopt_t) noexcept { - return (!x); -} - -template -constexpr bool operator==(nullopt_t, const optional& x) noexcept { - return (!x); -} - -template -constexpr bool operator!=(const optional& x, nullopt_t) noexcept { - return bool(x); -} - -template -constexpr bool operator!=(nullopt_t, const optional& x) noexcept { - return bool(x); -} - -template -constexpr bool operator<(const optional&, nullopt_t) noexcept { - return false; -} - -template -constexpr bool operator<(nullopt_t, const optional& x) noexcept { - return bool(x); -} - -template -constexpr bool operator<=(const optional& x, nullopt_t) noexcept { - return (!x); -} - -template -constexpr bool operator<=(nullopt_t, const optional&) noexcept { - return true; -} - -template -constexpr bool operator>(const optional& x, nullopt_t) noexcept { - return bool(x); -} - -template -constexpr bool operator>(nullopt_t, const optional&) noexcept { - return false; -} - -template -constexpr bool operator>=(const optional&, nullopt_t) noexcept { - return true; -} - -template -constexpr bool operator>=(nullopt_t, const optional& x) noexcept { - return (!x); -} - -// 20.5.10, Comparison with T -template -constexpr bool operator==(const optional& x, const T& v) { - return bool(x) ? *x == v : false; -} - -template -constexpr bool operator==(const T& v, const optional& x) { - return bool(x) ? v == *x : false; -} - -template -constexpr bool operator!=(const optional& x, const T& v) { - return bool(x) ? *x != v : true; -} - -template -constexpr bool operator!=(const T& v, const optional& x) { - return bool(x) ? v != *x : true; -} - -template -constexpr bool operator<(const optional& x, const T& v) { - return bool(x) ? *x < v : true; -} - -template -constexpr bool operator>(const T& v, const optional& x) { - return bool(x) ? v > *x : true; -} - -template -constexpr bool operator>(const optional& x, const T& v) { - return bool(x) ? *x > v : false; -} - -template -constexpr bool operator<(const T& v, const optional& x) { - return bool(x) ? v < *x : false; -} - -template -constexpr bool operator>=(const optional& x, const T& v) { - return bool(x) ? *x >= v : false; -} - -template -constexpr bool operator<=(const T& v, const optional& x) { - return bool(x) ? v <= *x : false; -} - -template -constexpr bool operator<=(const optional& x, const T& v) { - return bool(x) ? *x <= v : true; -} - -template -constexpr bool operator>=(const T& v, const optional& x) { - return bool(x) ? v >= *x : true; -} - -// Comparison of optional with T -template -constexpr bool operator==(const optional& x, const T& v) { - return bool(x) ? *x == v : false; -} - -template -constexpr bool operator==(const T& v, const optional& x) { - return bool(x) ? v == *x : false; -} - -template -constexpr bool operator!=(const optional& x, const T& v) { - return bool(x) ? 
*x != v : true; -} - -template -constexpr bool operator!=(const T& v, const optional& x) { - return bool(x) ? v != *x : true; -} - -template -constexpr bool operator<(const optional& x, const T& v) { - return bool(x) ? *x < v : true; -} - -template -constexpr bool operator>(const T& v, const optional& x) { - return bool(x) ? v > *x : true; -} - -template -constexpr bool operator>(const optional& x, const T& v) { - return bool(x) ? *x > v : false; -} - -template -constexpr bool operator<(const T& v, const optional& x) { - return bool(x) ? v < *x : false; -} - -template -constexpr bool operator>=(const optional& x, const T& v) { - return bool(x) ? *x >= v : false; -} - -template -constexpr bool operator<=(const T& v, const optional& x) { - return bool(x) ? v <= *x : false; -} - -template -constexpr bool operator<=(const optional& x, const T& v) { - return bool(x) ? *x <= v : true; -} - -template -constexpr bool operator>=(const T& v, const optional& x) { - return bool(x) ? v >= *x : true; -} - -// Comparison of optional with T -template -constexpr bool operator==(const optional& x, const T& v) { - return bool(x) ? *x == v : false; -} - -template -constexpr bool operator==(const T& v, const optional& x) { - return bool(x) ? v == *x : false; -} - -template -constexpr bool operator!=(const optional& x, const T& v) { - return bool(x) ? *x != v : true; -} - -template -constexpr bool operator!=(const T& v, const optional& x) { - return bool(x) ? v != *x : true; -} - -template -constexpr bool operator<(const optional& x, const T& v) { - return bool(x) ? *x < v : true; -} - -template -constexpr bool operator>(const T& v, const optional& x) { - return bool(x) ? v > *x : true; -} - -template -constexpr bool operator>(const optional& x, const T& v) { - return bool(x) ? *x > v : false; -} - -template -constexpr bool operator<(const T& v, const optional& x) { - return bool(x) ? v < *x : false; -} - -template -constexpr bool operator>=(const optional& x, const T& v) { - return bool(x) ? *x >= v : false; -} - -template -constexpr bool operator<=(const T& v, const optional& x) { - return bool(x) ? v <= *x : false; -} - -template -constexpr bool operator<=(const optional& x, const T& v) { - return bool(x) ? *x <= v : true; -} - -template -constexpr bool operator>=(const T& v, const optional& x) { - return bool(x) ? v >= *x : true; -} - -// 20.5.12, Specialized algorithms -template -void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) { - x.swap(y); -} - -template -constexpr optional::type> make_optional(T&& v) { - return optional::type>(constexpr_forward(v)); -} - -template -constexpr optional make_optional(std::reference_wrapper v) { - return optional(v.get()); -} - -} // namespace at - -namespace std { -template -struct hash> { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? std::hash{}(*arg) : result_type{}; - } -}; - -template -struct hash> { - typedef typename hash::result_type result_type; - typedef at::optional argument_type; - - constexpr result_type operator()(argument_type const& arg) const { - return arg ? 
std::hash{}(*arg) : result_type{}; - } -}; -} // namespace std - -#undef TR2_OPTIONAL_REQUIRES -#undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h deleted file mode 100644 index eed9f677a2ef18..00000000000000 --- a/aten/src/ATen/cuda/detail/KernelUtils.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once -// Contents of this file are copied from THCUNN/common.h for the ease of porting -// THCUNN functions into ATen. - -namespace at { namespace cuda { namespace detail { - -// CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -// Use 1024 threads per block, which requires cuda sm_2x or above -constexpr int CUDA_NUM_THREADS = 1024; - -// CUDA: number of blocks for threads. -inline int GET_BLOCKS(const int N) -{ - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -}}} // namespace at::cuda::detail diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 7ce3da3c9e051c..085f2723bf0455 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -319,20 +319,6 @@ struct AT_CUDA_API RNNDescriptor } }; -#if CUDNN_VERSION >= 7000 - -struct AT_CUDA_API CTCLossDescriptor - : public Descriptor -{ - void set(cudnnDataType_t datatype) { - AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype)); - } -}; - -#endif - union Constant { float f; diff --git a/aten/src/ATen/detail/UniqueVoidPtr.cpp b/aten/src/ATen/detail/UniqueVoidPtr.cpp new file mode 100644 index 00000000000000..07531d826367ae --- /dev/null +++ b/aten/src/ATen/detail/UniqueVoidPtr.cpp @@ -0,0 +1,7 @@ +#include + +namespace at { namespace detail { + +void deleteNothing(void*) {} + +}} // namespace at diff --git a/aten/src/ATen/core/UniqueVoidPtr.h b/aten/src/ATen/detail/UniqueVoidPtr.h similarity index 77% rename from aten/src/ATen/core/UniqueVoidPtr.h rename to aten/src/ATen/detail/UniqueVoidPtr.h index 299c729e125a58..e277014a7935d6 100644 --- a/aten/src/ATen/core/UniqueVoidPtr.h +++ b/aten/src/ATen/detail/UniqueVoidPtr.h @@ -1,15 +1,15 @@ #include -#include +#include namespace at { -using DeleterFnPtr = void (*)(void*); +using DeleterFnPtr = void(*)(void*); namespace detail { // Does not delete anything -AT_CORE_API void deleteNothing(void*); +AT_API void deleteNothing(void*); // A detail::UniqueVoidPtr is an owning smart pointer like unique_ptr, but // with three major differences: @@ -35,47 +35,33 @@ AT_CORE_API void deleteNothing(void*); // to reflect this. // class UniqueVoidPtr { - private: +private: // Lifetime tied to ctx_ void* data_; std::unique_ptr ctx_; - - public: +public: UniqueVoidPtr() : data_(nullptr), ctx_(nullptr, &deleteNothing) {} - explicit UniqueVoidPtr(void* data) - : data_(data), ctx_(nullptr, &deleteNothing) {} + explicit UniqueVoidPtr(void* data) : data_(data), ctx_(nullptr, &deleteNothing) {} UniqueVoidPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter) - : data_(data), ctx_(ctx, ctx_deleter ? ctx_deleter : &deleteNothing) {} - void* operator->() const { - return data_; - } + : data_(data), ctx_(ctx, ctx_deleter ? 
ctx_deleter : &deleteNothing) {} + void* operator->() const { return data_; } void clear() { ctx_ = nullptr; data_ = nullptr; } - void* get() const { - return data_; - } - void* get_context() const { - return ctx_.get(); - } - void* release_context() { - return ctx_.release(); - } + void* get() const { return data_; } + void* get_context() const { return ctx_.get(); } + void* release_context() { return ctx_.release(); } template T* cast_context(DeleterFnPtr expected_deleter) const { - if (get_deleter() != expected_deleter) - return nullptr; + if (get_deleter() != expected_deleter) return nullptr; return static_cast(get_context()); } - operator bool() const { - return data_ || ctx_; - } - DeleterFnPtr get_deleter() const { - return ctx_.get_deleter(); - } + operator bool() const { return data_ || ctx_; } + DeleterFnPtr get_deleter() const { return ctx_.get_deleter(); } }; + // Note [How UniqueVoidPtr is implemented] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // UniqueVoidPtr solves a common problem for allocators of tensor data, which @@ -94,18 +80,9 @@ class UniqueVoidPtr { // pointer itself. In simple cases, the context pointer is just the pointer // itself. -inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { - return !sp; -} -inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { - return !sp; -} -inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { - return sp; -} -inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { - return sp; -} +inline bool operator==(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return !sp; } +inline bool operator==(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return !sp; } +inline bool operator!=(const UniqueVoidPtr& sp, std::nullptr_t) noexcept { return sp; } +inline bool operator!=(std::nullptr_t, const UniqueVoidPtr& sp) noexcept { return sp; } -} // namespace detail -} // namespace at +}} // namespace at::detail diff --git a/aten/src/ATen/detail/VariableHooksInterface.h b/aten/src/ATen/detail/VariableHooksInterface.h index 836dacb97766ec..287116490397f3 100644 --- a/aten/src/ATen/detail/VariableHooksInterface.h +++ b/aten/src/ATen/detail/VariableHooksInterface.h @@ -3,7 +3,6 @@ #include #include #include -#include namespace at { class Context; @@ -26,10 +25,6 @@ struct AT_API VariableHooksInterface { // squelch -Werror=non-virtual-dtor virtual ~VariableHooksInterface() {} - virtual Type& getVariableType(const at::Type& baseType) const { - AT_ERROR("cannot getVariableType without libtorch"); - } - virtual void registerVariableTypeFor(Context*, Backend backend, ScalarType scalar_type) const { // no-op if Variable not available; it'll get handled (if at all) when // libtorch.so gets loaded diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index b012de25194361..93c20d4be032f4 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -290,7 +290,7 @@ def __init__(self, reason): 'Backend::${DenseBackend}, ScalarType::Long)'), 'THStorage*': CodeTemplate( - 'checked_cast_storage(' + 'checked_cast_storage<${Storage}>(' '&${arg_name},"${arg_name}",${arg_pos}, ' 'Backend::${Backend}, ScalarType::${ScalarName})'), 'THGenerator*': diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py index 209cca57c293ff..0f2aaffd6eac9d 100644 --- a/aten/src/ATen/gen.py +++ b/aten/src/ATen/gen.py @@ -103,6 +103,10 @@ def check_all_files_written(self): TEMPLATE_PATH = options.source_path + "/templates" GENERATOR_DERIVED = 
CodeTemplate.from_file( TEMPLATE_PATH + "/GeneratorDerived.h") +STORAGE_DERIVED_CPP = CodeTemplate.from_file( + TEMPLATE_PATH + "/StorageDerived.cpp") +STORAGE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/StorageDerived.h") + TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.cpp") SPARSE_TYPE_DERIVED_CPP = CodeTemplate.from_file(TEMPLATE_PATH + "/SparseTypeDerived.cpp") TYPE_DERIVED_H = CodeTemplate.from_file(TEMPLATE_PATH + "/TypeDerived.h") @@ -233,6 +237,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['isFloatingType'] = is_floating_type env['isIntegralType'] = not is_floating_type if density == 'Dense': + env['Storage'] = "{}{}Storage".format(backend, scalar_name) env['Tensor'] = "{}{}{}Tensor".format(density_tag, backend, scalar_name) env['Type'] = "{}{}{}Type".format(density_tag, backend, scalar_name) env['DenseTensor'] = "{}{}Tensor".format(backend, scalar_name) @@ -241,6 +246,7 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations env['storage_tensor_headers'] = [] if density != 'Sparse': env['storage_tensor_headers'] = [ + '#include "ATen/{}.h"'.format(env['Storage']), '#include "ATen/{}.h"'.format(env['Tensor']), '#include "ATen/{}ByteTensor.h"'.format(env['Backend']), '#include "ATen/{}IntTensor.h"'.format(env['Backend']), @@ -316,6 +322,8 @@ def generate_storage_type_and_tensor(backend, density, scalar_type, declarations if density != 'Sparse': # there are no storage or tensor types for sparse; it's all uniform + fm.write(env['Storage'] + ".cpp", STORAGE_DERIVED_CPP, env) + fm.write(env['Storage'] + ".h", STORAGE_DERIVED_H, env) env['TensorDenseOrSparse'] = TENSOR_DENSE_CPP.substitute(env) fm.write(env['Tensor'] + ".cpp", TENSOR_DERIVED_CPP, env) fm.write(env['Tensor'] + ".h", TENSOR_DERIVED_H, env) @@ -371,7 +379,7 @@ def declare_outputs(): for backend, density, scalar_types in iterate_types(): scalar_name = scalar_types[0] full_backend = "Sparse" + backend if density == "Sparse" else backend - for kind in ["Type", "Tensor"]: + for kind in ["Storage", "Type", "Tensor"]: if kind != 'Type' and density == "Sparse": # No Storage or Tensor for sparse continue diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 36f1e4c0bf86de..a3dc735ab1e4cb 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -25,16 +25,6 @@ Tensor & selu_(Tensor & self) { return at::elu_(self, SELU_ALPHA, SELU_SCALE); } -Tensor celu(const Tensor & self, Scalar alpha) { - double inv_alpha = 1. / alpha.to(); - return at::elu(self, 1.0, alpha, Scalar(inv_alpha)); -} - -Tensor & celu_(Tensor & self, Scalar alpha) { - double inv_alpha = 1. 
/ alpha.to(); - return at::elu_(self, 1.0, alpha, Scalar(inv_alpha)); -} - Tensor rrelu(const Tensor & self, Scalar lower, Scalar upper, bool training, Generator* generator) { return at::rrelu_with_noise(self, self.type().tensor(), lower, upper, training, generator); } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 4028e989b87022..a537691f748171 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -402,11 +402,11 @@ at::Tensor _convolution_nogroup( bool transposed, IntList output_padding) { ConvParams params; - params.stride = stride.vec(); - params.padding = padding.vec(); - params.dilation = dilation.vec(); + params.stride = stride; + params.padding = padding; + params.dilation = dilation; params.transposed = transposed; - params.output_padding = output_padding.vec(); + params.output_padding = output_padding; params.groups = 1; params.benchmark = false; params.deterministic = false; @@ -474,11 +474,11 @@ std::tuple _convolution_double_backward( auto weight = weight_r; ConvParams params; - params.stride = stride_.vec(); - params.padding = padding_.vec(); - params.dilation = dilation_.vec(); + params.stride = stride_; + params.padding = padding_; + params.dilation = dilation_; params.transposed = transposed_; - params.output_padding = output_padding_.vec(); + params.output_padding = output_padding_; params.groups = groups_; params.benchmark = benchmark; params.deterministic = deterministic; diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index c374740a3ce7d1..7a6e0788531172 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -57,7 +57,6 @@ deviceforcuda scalar_t sample_gamma(scalar_t alpha, BaseSampler& st // Boost alpha for higher acceptance probability. 
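The branch that follows is the standard boosting trick for small shape parameters (Marsaglia and Tsang): draw Y from Gamma(alpha + 1) and return Y * U^(1/alpha) with U uniform on (0, 1), which is Gamma(alpha) distributed. A quick plain-C++ sanity check of the mean, which should land near alpha, using std::gamma_distribution rather than ATen's sampler:

#include <cmath>
#include <iostream>
#include <random>

int main() {
  const double alpha = 0.3;                      // shape parameter < 1
  std::mt19937 gen(42);
  std::gamma_distribution<double> boosted(alpha + 1.0, 1.0);
  std::uniform_real_distribution<double> unif(0.0, 1.0);

  double sum = 0.0;
  const int n = 200000;
  for (int i = 0; i < n; ++i) {
    double u = 1.0 - unif(gen);                  // keep u strictly positive, as the code above does
    sum += boosted(gen) * std::pow(u, 1.0 / alpha);
  }
  std::cout << "empirical mean " << sum / n << ", expected " << alpha << "\n";
  return 0;
}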
if (alpha < 1.0f) { - if (alpha == 0.f) return 0.f; scale *= std::pow(1 - standard_uniform.sample(), 1.0f / alpha); alpha += 1.0f; } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 0026a9907d7eca..7599386ee74172 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -24,7 +24,7 @@ Tensor embedding(const Tensor & weight, const Tensor & indices, return weight.index_select(0, indices); } - auto size = indices.sizes().vec(); + auto size = std::vector(indices.sizes()); for (auto d : weight.sizes().slice(1)) { size.push_back(d); } diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp deleted file mode 100644 index 5f1c8255772dcf..00000000000000 --- a/aten/src/ATen/native/GridSampler.cpp +++ /dev/null @@ -1,780 +0,0 @@ -#include "ATen/ATen.h" -#include "ATen/NativeFunctions.h" -#include "ATen/detail/CUDAHooksInterface.h" -#include "ATen/native/GridSampler.h" - -#ifdef _OPENMP -#include -#endif - -namespace at { namespace native { - -using at::native::detail::GridSamplerInterpolation; -using at::native::detail::GridSamplerPadding; - -namespace { - static inline int64_t clip_coordinates(int64_t in, int64_t clip_limit) { - return std::min(clip_limit - 1, std::max(in, static_cast(0))); - } - - static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { - return h >= 0 && h < H && w >= 0 && w < W; - } - - static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; - } - - template - static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, - int64_t sH, int64_t sW, int64_t H, int64_t W, - scalar_t delta) { - if (within_bounds_2d(h, w, H, W)) { - data[h * sH + w * sW] += delta; - } - } - - template - static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, - int64_t sD, int64_t sH, int64_t sW, - int64_t D, int64_t H, int64_t W, - scalar_t delta) { - if (within_bounds_3d(d, h, w, D, H, W)) { - data[d * sD + h * sH + w * sW] += delta; - } - } - - template - Tensor grid_sampler2d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h 
* grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_nw = clip_coordinates(ix_nw, inp_W); - iy_nw = clip_coordinates(iy_nw, inp_H); - ix_ne = clip_coordinates(ix_ne, inp_W); - iy_ne = clip_coordinates(iy_ne, inp_H); - ix_sw = clip_coordinates(ix_sw, inp_W); - iy_sw = clip_coordinates(iy_sw, inp_H); - ix_se = clip_coordinates(ix_se, inp_W); - iy_se = clip_coordinates(iy_se, inp_H); - } - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne - // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se - *out_ptr_NCHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } - } - } - return output; - } - - template - Tensor grid_sampler3d_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_D = input.size(2); - int64_t inp_H = input.size(3); - int64_t inp_W = input.size(4); - int64_t out_D = grid.size(1); - int64_t out_H = grid.size(2); - int64_t out_W = grid.size(3); - auto output = at::empty({N, C, out_D, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sD = input.stride(2); - int64_t inp_sH = input.stride(3); - int64_t inp_sW = input.stride(4); - int64_t grid_sN = grid.stride(0); - int64_t grid_sD = grid.stride(1); - int64_t grid_sH = grid.stride(2); - int64_t grid_sW = grid.stride(3); - int64_t grid_sCoor = grid.stride(4); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sD = output.stride(2); - int64_t out_sH = output.stride(3); - int64_t out_sW = output.stride(4); - scalar_t *inp_ptr = input.data(); - scalar_t *out_ptr = output.data(); - scalar_t *grid_ptr = grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t 
*grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NDHW; - scalar_t iy = grid_ptr_NDHW[grid_sCoor]; - scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - iz = ((iz + 1) / 2) * (inp_D - 1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_tnw = static_cast(std::floor(ix)); - int64_t iy_tnw = static_cast(std::floor(iy)); - int64_t iz_tnw = static_cast(std::floor(iz)); - - int64_t ix_tne = ix_tnw + 1; - int64_t iy_tne = iy_tnw; - int64_t iz_tne = iz_tnw; - - int64_t ix_tsw = ix_tnw; - int64_t iy_tsw = iy_tnw + 1; - int64_t iz_tsw = iz_tnw; - - int64_t ix_tse = ix_tnw + 1; - int64_t iy_tse = iy_tnw + 1; - int64_t iz_tse = iz_tnw; - - int64_t ix_bnw = ix_tnw; - int64_t iy_bnw = iy_tnw; - int64_t iz_bnw = iz_tnw + 1; - - int64_t ix_bne = ix_tnw + 1; - int64_t iy_bne = iy_tnw; - int64_t iz_bne = iz_tnw + 1; - - int64_t ix_bsw = ix_tnw; - int64_t iy_bsw = iy_tnw + 1; - int64_t iz_bsw = iz_tnw + 1; - - int64_t ix_bse = ix_tnw + 1; - int64_t iy_bse = iy_tnw + 1; - int64_t iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw = clip_coordinates(ix_tnw, inp_W); - iy_tnw = clip_coordinates(iy_tnw, inp_H); - iz_tnw = clip_coordinates(iz_tnw, inp_D); - ix_tne = clip_coordinates(ix_tne, inp_W); - iy_tne = clip_coordinates(iy_tne, inp_H); - iz_tne = clip_coordinates(iz_tne, inp_D); - ix_tsw = clip_coordinates(ix_tsw, inp_W); - iy_tsw = clip_coordinates(iy_tsw, inp_H); - iz_tsw = clip_coordinates(iz_tsw, inp_D); - ix_tse = clip_coordinates(ix_tse, inp_W); - iy_tse = clip_coordinates(iy_tse, inp_H); - iz_tse = clip_coordinates(iz_tse, inp_D); - ix_bnw = clip_coordinates(ix_bnw, inp_W); - iy_bnw = clip_coordinates(iy_bnw, inp_H); - iz_bnw = clip_coordinates(iz_bnw, inp_D); - ix_bne = clip_coordinates(ix_bne, inp_W); - iy_bne = clip_coordinates(iy_bne, inp_H); - iz_bne = clip_coordinates(iz_bne, inp_D); - ix_bsw = clip_coordinates(ix_bsw, inp_W); - iy_bsw = clip_coordinates(iy_bsw, inp_H); - iz_bsw = clip_coordinates(iz_bsw, inp_D); - ix_bse = clip_coordinates(ix_bse, inp_W); - iy_bse = clip_coordinates(iy_bse, inp_H); - iz_bse = clip_coordinates(iz_bse, inp_D); - } - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iz_tnw, 
iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCDHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } - } - } - } - return output; - } - - template - std::tuple - grid_sampler2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t 
n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid_ptr_N[h * grid_sH + w * grid_sW]; - scalar_t iy = grid_ptr_N[h * grid_sH + w * grid_sW + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - - // get NE, NW, SE, SW pixel values from (x, y) - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - int64_t ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - if (padding_mode == GridSamplerPadding::Border) { - // get clipped NE, NW, SE, SW pixel values from (x, y) - ix_nw_cl = clip_coordinates(ix_nw, inp_W); - iy_nw_cl = clip_coordinates(iy_nw, inp_H); - ix_ne_cl = clip_coordinates(ix_ne, inp_W); - iy_ne_cl = clip_coordinates(iy_ne, inp_H); - ix_sw_cl = clip_coordinates(ix_sw, inp_W); - iy_sw_cl = clip_coordinates(iy_sw, inp_H); - ix_se_cl = clip_coordinates(ix_se, inp_W); - iy_se_cl = clip_coordinates(iy_se, inp_H); - } else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (padding_mode != 
GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix; - gGrid_ptr_NHW[1] = giy; - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - - template - std::tuple - grid_sampler3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_D = input.size(2); - int64_t inp_H = input.size(3); - int64_t inp_W = input.size(4); - int64_t out_D = grid.size(1); - int64_t out_H = grid.size(2); - int64_t out_W = grid.size(3); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sD = input.stride(2); - int64_t inp_sH = input.stride(3); - int64_t inp_sW = input.stride(4); - int64_t grid_sN = grid.stride(0); - int64_t grid_sD = grid.stride(1); - int64_t grid_sH = grid.stride(2); - int64_t grid_sW = grid.stride(3); - int64_t grid_sCoor = grid.stride(4); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sD = grad_output.stride(2); - int64_t gOut_sH = grad_output.stride(3); - int64_t gOut_sW = grad_output.stride(4); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sD = grad_input.stride(2); - int64_t gInp_sH = grad_input.stride(3); - int64_t gInp_sW = grad_input.stride(4); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(3); - scalar_t *inp_ptr = input.data(); - scalar_t *grid_ptr = grid.data(); - scalar_t *gOut_ptr = grad_output.data(); - scalar_t *gInp_ptr = grad_input.data(); - scalar_t *gGrid_ptr = grad_grid.data(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NDHW; - scalar_t iy = grid_ptr_NDHW[grid_sCoor]; - scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - ix = ((ix + 1) / 2) * (inp_W - 1); - iy = ((iy + 1) / 2) * (inp_H - 1); - iz = ((iz + 1) / 2) * (inp_D - 1); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_tnw = static_cast(std::floor(ix)); - int64_t iy_tnw = static_cast(std::floor(iy)); - int64_t iz_tnw = static_cast(std::floor(iz)); - - int64_t ix_tne = ix_tnw + 1; - int64_t iy_tne = iy_tnw; - int64_t iz_tne = iz_tnw; - - int64_t ix_tsw = ix_tnw; - int64_t iy_tsw = iy_tnw + 1; - int64_t iz_tsw = iz_tnw; - - int64_t ix_tse = ix_tnw + 1; - int64_t iy_tse = 
iy_tnw + 1; - int64_t iz_tse = iz_tnw; - - int64_t ix_bnw = ix_tnw; - int64_t iy_bnw = iy_tnw; - int64_t iz_bnw = iz_tnw + 1; - - int64_t ix_bne = ix_tnw + 1; - int64_t iy_bne = iy_tnw; - int64_t iz_bne = iz_tnw + 1; - - int64_t ix_bsw = ix_tnw; - int64_t iy_bsw = iy_tnw + 1; - int64_t iz_bsw = iz_tnw + 1; - - int64_t ix_bse = ix_tnw + 1; - int64_t iy_bse = iy_tnw + 1; - int64_t iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - int64_t ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int64_t ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int64_t ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int64_t ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); - iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); - iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); - ix_tne_cl = clip_coordinates(ix_tne, inp_W); - iy_tne_cl = clip_coordinates(iy_tne, inp_H); - iz_tne_cl = clip_coordinates(iz_tne, inp_D); - ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); - iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); - iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); - ix_tse_cl = clip_coordinates(ix_tse, inp_W); - iy_tse_cl = clip_coordinates(iy_tse, inp_H); - iz_tse_cl = clip_coordinates(iz_tse, inp_D); - ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); - iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); - iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); - ix_bne_cl = clip_coordinates(ix_bne, inp_W); - iy_bne_cl = clip_coordinates(iy_bne, inp_H); - iz_bne_cl = clip_coordinates(iz_bne, inp_D); - ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); - iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); - iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); - ix_bse_cl = clip_coordinates(ix_bse, inp_W); - iy_bse_cl = clip_coordinates(iy_bse, inp_H); - iz_bse_cl = clip_coordinates(iz_bse, inp_D); - } else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCDHW; - - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, 
iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { - scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { - scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { - scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { - scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { - scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { - scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { - scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * 
inp_sW]; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { - scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - giz = giz * (inp_D - 1) / 2; - - // assuming grad_grid is contiguous - gGrid_ptr_NDHW[0] = gix; - gGrid_ptr_NDHW[1] = giy; - gGrid_ptr_NDHW[2] = giz; - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler2d_cpu", [&] { - return grid_sampler2d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler3d_cpu", [&] { - return grid_sampler3d_cpu_impl( - input, grid, static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler2d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
-std::tuple -grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - return AT_DISPATCH_FLOATING_TYPES(input.type(), "grid_sampler_3d_backward_cpu", [&] { - return grid_sampler3d_backward_cpu_impl( - grad_output, input, grid, - static_cast(interpolation_mode), - static_cast(padding_mode)); - }); -} - -Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { - AT_CHECK( - (input.dim() == 4 || input.dim() == 5) && input.dim() == grid.dim(), - "grid_sampler(): expected 4D or 5D input and grid with same number " - "dimensions, but got input with sizes ", input.sizes(), - " and grid with sizes ", grid.sizes()); - AT_CHECK( - input.size(0) == grid.size(0), - "grid_sampler(): expected grid and input to have same batch size, but got " - "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); - AT_CHECK( - grid.size(-1) == input.dim() - 2, - "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " - "dimension, but got grid with sizes ", grid.sizes()); - // cudnn does not support inputs larger than 1024 - if (at::native::cudnn_is_acceptable(input) && - static_cast(padding_mode) == GridSamplerPadding::Zeros && - input.dim() == 4 && - input.size(1) <= 1024) { - return cudnn_grid_sampler(input, grid); - } - if (input.dim() == 4) { - return at::grid_sampler_2d(input, grid, 0, padding_mode); - } else { - return at::grid_sampler_3d(input, grid, 0, padding_mode); - } -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h deleted file mode 100644 index f39b4e996469fa..00000000000000 --- a/aten/src/ATen/native/GridSampler.h +++ /dev/null @@ -1,9 +0,0 @@ -#include "ATen/ATen.h" -#include "ATen/NativeFunctions.h" - -namespace at { namespace native { namespace detail { - - enum class GridSamplerInterpolation {Bilinear, Nearest}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -}}} // namespace at::native::detail diff --git a/aten/src/ATen/native/Indexing.cpp b/aten/src/ATen/native/Indexing.cpp index e4eb336cd5f453..9720adb4895769 100644 --- a/aten/src/ATen/native/Indexing.cpp +++ b/aten/src/ATen/native/Indexing.cpp @@ -69,7 +69,11 @@ static std::vector expandByteTensors(const Tensor & self, TensorList ind } // Replace with nonzeros auto nonzero = index.nonzero(); +#ifndef USE_TH_SIZE_ZERO_DIM + auto special_empty = nonzero.numel() == 0; +#else auto special_empty = false; +#endif for (int64_t j = 0; j < index.dim(); j++) { if (special_empty) { // We can't call select on an empty tensor so we just create an empty @@ -210,10 +214,26 @@ static Tensor computeLinearIndex(const Tensor & src, TensorList indices) { return linearIndex; } +#ifndef USE_TH_SIZE_ZERO_DIM +static bool hasEmptyTensor(TensorList tensors) { + for (auto& tensor : tensors) { + if (tensor.defined() && tensor.numel() == 0) { + return true; + } + } + return false; +} +#endif + static std::tuple makeLinearIndex(Tensor self, TensorList orig) { checkIndexTensorTypes(orig); // first expand ByteTensor (boolean masks) into 1 or more LongTensors auto indices = expandByteTensors(self, orig); +#ifndef USE_TH_SIZE_ZERO_DIM + if (hasEmptyTensor(indices)) { + return std::make_tuple(self, self.type().toScalarType(kLong).tensor()); + } +#endif // next broadcast all index tensors together indices = expand_outplace(indices); // add missing null Tensors so that it matches self.dim() @@ -279,11 +299,11 @@ Tensor & 
index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten } // Check that source and destination slices have the same size - auto selfSlicedSizes = self.sizes().vec(); + auto selfSlicedSizes = std::vector(self.sizes()); if (selfSlicedSizes.size() > 0) { selfSlicedSizes.erase(selfSlicedSizes.begin() + dim); } - auto sourceSlicedSizes = source.sizes().vec(); + auto sourceSlicedSizes = std::vector(source.sizes()); if (sourceSlicedSizes.size() > 0) { sourceSlicedSizes.erase(sourceSlicedSizes.begin() + dim); } diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index c82bf8ba0ae043..cb24e71119f9b1 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -1,7 +1,6 @@ #include "ATen/ATen.h" #include "ATen/NativeFunctions.h" #include "ATen/WrapDimUtilsMulti.h" -#include namespace at { namespace native { @@ -137,8 +136,6 @@ Tensor einsum(std::string eqn, TensorList tensors) { } else { in_eqn = eqn; } - // remove spaces for einsum compatibility (#9929) - in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index int64_t operand = 0; @@ -215,7 +212,7 @@ Tensor einsum(std::string eqn, TensorList tensors) { num_output_dims++; } } - } else if (! isspace(c)) { // letter (hopefully) + } else { // letter (hopefully) AT_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); AT_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); int64_t letter_num = c-'a'; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp deleted file mode 100644 index 092b7255eb4a0d..00000000000000 --- a/aten/src/ATen/native/LossCTC.cpp +++ /dev/null @@ -1,365 +0,0 @@ -// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann -// Licensed under the BSD-3-Clause license -// This is the CPU implementation of the Connectionist Temporal Loss. -// We mostly follow Graves. -// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf -// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. -// Graves et al call the probabilities y, we use log_probs (also calling them inputs) - -#include -#include "ATen/Dispatch.h" -#include "ATen/TensorUtils.h" - -#include -#include - -namespace at { -namespace native { - -namespace { - -// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done -template -static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { - if (idx % 2 == 0) { - return BLANK; - } else { - return target[offset + stride * (idx / 2)]; - } -} - -// This kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). -// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). -// The function returns the loss and the alphas, the alphas are kept for the backward step. The wrapper (ctc_loss below) hides -// the alphas from the user by only returning the loss. 
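// In equations, the forward pass below computes, for the augmented target l'
// (the target interleaved with blanks, length 2*target_length + 1), roughly
//   alpha_t(s) = y_t(l'_s) * ( alpha_{t-1}(s) + alpha_{t-1}(s-1)
//                              + [s > 1 and l'_{s-2} != l'_s] * alpha_{t-1}(s-2) ),
// with out-of-range terms treated as 0 (neginf in log space), and the loss is
//   -log( alpha_{T-1}(2*target_length) + alpha_{T-1}(2*target_length - 1) ).
// Everything is carried out in log space with a max-shift for stability.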
-template -std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - // log_probs: input_len x batch_size x num_labels - // targets [int64]: batch_size x target_length OR sum(target_lengths) - constexpr scalar_t neginf = -std::numeric_limits::infinity(); - using target_t = typename std::conditional::type; - - CheckedFrom c = "ctc_loss_cpu"; - auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); - auto targets_arg = TensorArg(targets, "targets", 2); - checkScalarType(c, targets_arg, target_scalar_type); - checkDim(c, log_probs_arg, 3); - checkDimRange(c, targets_arg, 1, 3); - - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - AT_CHECK(BLANK < num_labels, "blank must be in label range"); - AT_CHECK((int64_t) input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK((int64_t) target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); - - size_t tg_target_stride; - int64_t max_target_length; - std::vector tg_batch_offsets(batch_size); - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - checkSize(c, targets_arg, 0, pos); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, - "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, - "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - - Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); - Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - - auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); - auto log_alpha_a_global = log_alpha.accessor(); - auto targets_data = targets.data(); - auto neg_log_likelihood_a = neg_log_likelihood.accessor(); - - // alpha calculation for the first row, the three equations for alpha_1 above eq (6) - // first the default - log_alpha.narrow(1, 0, 1).fill_(neginf); - #pragma omp parallel for - for (int64_t b = 0; b < batch_size; b++) { - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - auto log_probs_a = log_probs_a_global[b]; - auto log_alpha_a = log_alpha_a_global[b]; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - // the first two items of alpha_t above eq (6) - log_alpha_a[0][0] = log_probs_a[0][BLANK]; - if (target_length > 0) - log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; - - // now the loop over the inputs - for (int64_t t=1; t 0) { 
- la2 = log_alpha_a[t-1][s-1]; - if (la2 > lamax) - lamax = la2; - } else { - la2 = neginf; - } - if ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != - current_target_prime)) { - la3 = log_alpha_a[t-1][s-2]; - if (la3 > lamax) - lamax = la3; - } else { - la3 = neginf; - } - if (lamax == neginf) // cannot do neginf-neginf - lamax = 0; - // this is the assignment of eq (6) - log_alpha_a[t][s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax + log_probs_a[t][current_target_prime]; - } - } - // the likelihood is the the sum of the last two alphas, eq (8), the loss is the negative log likelihood - scalar_t l1 = log_alpha_a[input_length-1][target_length*2]; - scalar_t l2 = log_alpha_a[input_length-1][target_length*2-1]; - scalar_t m = std::max(l1, l2); - m = ((m == neginf) ? 0 : m); - scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; - neg_log_likelihood_a[b] = -log_likelihood; - } - - return std::make_tuple(neg_log_likelihood, log_alpha); -} - -// This is the backward. It consists of two phases: -// a) computing the beta analogous to the alphas in the forward (backward half of the forward-backward algorithm) (eq (10) and (11)) -// b) collecting the per-activation characters for all s and wrapping the gradient (eq (16), the collection is the sum) -template -Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - constexpr scalar_t neginf = -std::numeric_limits::infinity(); - using target_t = typename std::conditional::type; - int64_t max_input_length = log_probs.size(0); - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - Tensor grad = at::full_like(log_probs, neginf); // at this point, this is log of empty sum - - // The admin bits. We don't do much checking and assume that the forward did. 
- int64_t tg_target_stride; - int64_t max_target_length; - std::vector tg_batch_offsets(batch_size); - - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - } - - Tensor log_beta = at::empty_like(log_alpha); // could be optimized to use only 2 rows - auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); - auto log_alpha_a_global = log_alpha.accessor(); - auto log_beta_a_global = log_beta.accessor(); - auto gp = grad.permute({1,0,2}); - auto grad_a_global = gp.accessor(); - auto targets_data = targets.data(); - - #pragma omp parallel for - for (int64_t b = 0; b < batch_size; b++) { - auto log_probs_a = log_probs_a_global[b]; - auto log_alpha_a = log_alpha_a_global[b]; - auto log_beta_a = log_beta_a_global[b]; - auto grad_a = grad_a_global[b]; - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - // the initialization of beta before eq (10) - // here we do the fill for each batch item separately, as the input lengths will differ, so the t in which - // we start varies - if (input_length > 0) { - log_beta.narrow(0, b, 1).narrow(1, input_length-1, 1).fill_(neginf); - log_beta_a[input_length-1][2*target_length] = log_probs_a[input_length-1][BLANK]; - grad_a[input_length-1][BLANK] = log_alpha_a[input_length-1][2*target_length] + log_beta_a[input_length-1][2*target_length]; - - if (target_length > 0) { - auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 2*target_length-1, BLANK); - log_beta_a[input_length-1][2*target_length-1] = log_probs_a[input_length-1][current_target_prime]; - - // the first two are a blank and a non-blank, so we know they are different and we don't need to do log+ - grad_a[input_length-1][current_target_prime] = log_alpha_a[input_length-1][2*target_length-1] + log_beta_a[input_length-1][2*target_length-1]; - } - } - - // now loop applying eq (10) / (11) - for (int64_t t=input_length-2; t>=0; t--) { - // this loop over s could be parallel/vectorized and doesn't really need to be descending... - // alternatively, one might consider moving s to the outer loop to cache current_target_prime more (but then it needs to be descending) - // for the cuda implementation, that gave a speed boost. 
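// For reference, the loop below evaluates the mirror-image recurrence
//   beta_t(s) = y_t(l'_s) * ( beta_{t+1}(s) + beta_{t+1}(s+1)
//                             + [s < 2*target_length - 1 and l'_{s+2} != l'_s] * beta_{t+1}(s+2) )
// in log space, using the usual shift-by-max trick
//   log(sum_i exp(x_i)) = m + log(sum_i exp(x_i - m)),   m = max_i x_i
// (with m clamped to 0 when every term is -inf), so that no exp() overflows.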
- for (int64_t s=2*target_length; s>=0; s--) { - scalar_t lb1 = log_beta_a[t+1][s]; - scalar_t lbmax = lb1; - scalar_t lb2, lb3; - auto current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - if (s < 2*target_length) { - lb2 = log_beta_a[t+1][s+1]; - if (lb2 > lbmax) - lbmax = lb2; - } else { - lb2 = neginf; - } - if ((s < 2*target_length-1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != - current_target_prime)) { - lb3 = log_beta_a[t+1][s+2]; - if (lb3 > lbmax) - lbmax = lb3; - } else { - lb3 = neginf; - } - if (lbmax == neginf) - lbmax = 0; - - log_beta_a[t][s] = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax + log_probs_a[t][current_target_prime]; - // one might check whether one can vectorize this better when done after the t-loop... - // now that we have beta, we fill in the sum of alpha*beta in eq (16) - // in contrast to the cuda implementation, we only parallelize over the batch, so we don't have a concurrency - // issue (several s can map to the same target character) - // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] - scalar_t log_alpha_beta = log_alpha_a[t][s] + log_beta_a[t][s]; - scalar_t &lcab = grad_a[t][current_target_prime]; - if (lcab == neginf) { - lcab = log_alpha_beta; - } else { - scalar_t max = std::max(lcab, log_alpha_beta); - lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; - } - } - } - - // now grad has the sum of eq (16) - // now we wrap up the calculation by adding in the remaining items of eq (16) - // this could be a great target for further vectorization. - // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) - scalar_t nll = neg_log_likelihood.accessor()[b]; - scalar_t gr = grad_out.accessor()[b]; - for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? 
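// At this point grad_a[t][c] holds log( sum over s with l'_s == c of
// alpha_t(s) * beta_t(s) ), or neginf if no such s exists; the loop below
// turns it into the per-activation gradient of eq (16), combining it with the
// stored negative log-likelihood nll and the incoming gradient gr, roughly
//   exp(lp) - exp(res + nll - lp),
// i.e. y_t(c) - (sum_s alpha_t(s) * beta_t(s)) / (y_t(c) * p(l|x)),
// since nll = -log p(l|x).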
- for (int64_t c = 0; c < num_labels; c++) { - scalar_t& res = grad_a[t][c]; - scalar_t lp = log_probs_a[t][c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; - } - } - // zero the remainder - if (input_length < max_input_length) { - grad.narrow(0, input_length, max_input_length - input_length).narrow(1, b, 1).zero_(); - } - } - return grad; -} - -} // namespace - -std::tuple ctc_loss_cpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } else { - return ctc_loss_cpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } - }); -} - -Tensor ctc_loss_backward_cpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } else { - return ctc_loss_backward_cpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } - }); -} - -// this wrapper function dispatches to the native and cudnn implementations and hides the alpha/grad from the user (by just returning the loss) -// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients -// it also handles the reduction if desired -Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, int64_t reduction) { - auto& ctx = at::globalContext(); - - bool use_cudnn = - detail::getCUDAHooks().compiledWithCuDNN() && - (detail::getCUDAHooks().versionCuDNN() >= 7000) && - ctx.userEnabledCuDNN() && - (BLANK == 0) && (targets.dim()==1) && - (log_probs.type().scalarType() == at::kFloat) && - (targets.type().scalarType() == at::kInt) && - (log_probs.type().backend() == Backend::CUDA); - - if (use_cudnn) { - // we don't know that input_lengths and target_lengths have the same size (they should, but we didn't check yet) - int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < input_lengths.size(); b++) { - use_cudnn &= (input_lengths[b] == max_input_length); - } - for (int64_t b = 0; b < target_lengths.size(); b++) { - use_cudnn &= (target_lengths[b] <= 256); - } - } - - Tensor res; - if (use_cudnn) { - res = std::get<0>(at::_cudnn_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK, ctx.deterministicCuDNN())); - } else { - res = std::get<0>(at::_ctc_loss(log_probs, targets, input_lengths, target_lengths, BLANK)); - } - if (reduction == Reduction::ElementwiseMean) { - auto target_lengths_t = at::tensor(target_lengths, res.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(res.type()); - return (res / target_lengths_t).mean(); - } else if (reduction == Reduction::Sum) { - return res.sum(); - } - return res; -} - -} } // at::native diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index b84b9c3f36b3ea..d6ebbd4573a70c 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ 
b/aten/src/ATen/native/TensorFactories.cpp @@ -141,9 +141,17 @@ Tensor& eye_out_cpu(Tensor& result, int64_t n) { } Tensor& eye_out_cpu(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else if(m < 0) { +#endif m = n; } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index be7e626fa1b748..f7ced03c5ab6fc 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -12,10 +12,6 @@ namespace at { namespace native { -std::vector broadcast_tensors(TensorList tensors) { - return expand_outplace(tensors); -} - static void check_cat_no_zero_dim(TensorList tensors) { for(size_t i = 0; i < tensors.size(); ++i) { auto& t = tensors[i]; @@ -82,6 +78,9 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ } else { diag_size = std::max(std::min(self.size(dim1)+offset, self.size(dim2)), 0); } +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(diag_size > 0, "invalid diagonal offset ", offset); // the diagonal offset was too large in magnitude +#endif // NumPy allows you to specify offsets "off the end"; let's just be careful not to // set a ridiculous storage_offset in that case (technically it shouldn't matter @@ -96,8 +95,8 @@ Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_ // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minumum) // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics - auto sizes = self.sizes().vec(); - auto strides = self.strides().vec(); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); sizes.erase(sizes.begin() + std::max(dim1, dim2)); strides.erase(strides.begin() + std::max(dim1, dim2)); sizes.erase(sizes.begin() + std::min(dim1, dim2)); @@ -158,7 +157,11 @@ Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) { if (start != cur_size) { // start being the end is valid, but not a valid dim specification. start = maybe_wrap_dim(start, cur_size); } +#ifndef USE_TH_SIZE_ZERO_DIM + if (length <= 0 || start > cur_size - length) { +#else if (length < 0 || start > cur_size - length) { +#endif AT_ERROR("start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ")."); } return at::slice(self, dim, start, start + length, 1); @@ -243,6 +246,14 @@ static std::vector infer_size(IntList shape, int64_t numel) { AT_CHECK(newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape); res[*infer_dim] = numel / newsize; } +#ifndef USE_TH_SIZE_ZERO_DIM + if (numel == 0) { + // Collapse zero-element shapes into one dimension because TH handles zeros + // in sizes strangely: x.resize_(1, 0) has shape (1,). TODO: remove this + // once we have multi-dimensional empty tensors. 
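// In other words, with USE_TH_SIZE_ZERO_DIM undefined, any zero-element
// reshape that reaches this point discards the requested shape and collapses
// to a 1-d tensor of size 0 (e.g. a request for (2, 0) comes back as (0,)).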
+ return {0}; + } +#endif return res; } @@ -280,8 +291,8 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index) { if (index < 0) { index += size; } - auto sizes = self.sizes().vec(); - auto strides = self.strides().vec(); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); auto storage_offset = self.storage_offset() + index * strides[dim]; sizes.erase(sizes.begin() + dim); strides.erase(strides.begin() + dim); @@ -292,8 +303,8 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ int64_t ndim = self.dim(); AT_CHECK(ndim > 0, "slice() cannot be applied to a 0-dim tensor."); dim = maybe_wrap_dim(dim, ndim); - auto sizes = self.sizes().vec(); - auto strides = self.strides().vec(); + auto sizes = std::vector(self.sizes()); + auto strides = std::vector(self.strides()); if (step <= 0) { // TODO: support negative strides throw std::runtime_error("slice step must be positive"); @@ -316,6 +327,12 @@ Tensor slice(const Tensor& self, int64_t dim, int64_t start, int64_t end, int64_ } auto storage_offset = self.storage_offset() + start * strides[dim]; auto len = end - start; +#ifndef USE_TH_SIZE_ZERO_DIM + if (len == 0) { + // TODO: currently we don't have support for 0-sized dims, return size 0 tensor for now + return self.type().tensor(); + } +#endif sizes[dim] = (len + step - 1) / step; // round-up strides[dim] *= step; return self.as_strided(sizes, strides, storage_offset); @@ -407,7 +424,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di } if (self._indices().numel() == 0 && self._values().numel() == 0) { - auto sizes = self.sizes().vec(); + std::vector sizes(self.sizes()); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, self._sparseDims(), self._denseDims()); @@ -422,7 +439,7 @@ static inline Tensor & sparse_transpose_(Tensor & self, int64_t dim0, int64_t di row0.copy_(row1); row1.copy_(tmp); - auto sizes = self.sizes().vec(); + std::vector sizes(self.sizes()); std::swap(sizes[dim0], sizes[dim1]); return self.sparse_raw_resize_(sizes, -1, -1); @@ -441,8 +458,8 @@ Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self, dim0, dim1); } - auto strides = self.strides().vec(); - auto sizes = self.sizes().vec(); + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided_(sizes, strides); @@ -461,8 +478,8 @@ Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1) { return sparse_transpose_(self_clone, dim0, dim1); } - auto strides = self.strides().vec(); - auto sizes = self.sizes().vec(); + std::vector strides(self.strides()); + std::vector sizes(self.sizes()); std::swap(strides[dim0], strides[dim1]); std::swap(sizes[dim0], sizes[dim1]); return self.as_strided(sizes, strides); @@ -522,8 +539,13 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { std::tuple, std::vector > inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - auto sizes = tensor.sizes().vec(); - auto strides = tensor.strides().vec(); +#ifndef USE_TH_SIZE_ZERO_DIM + if (tensor.numel() == 0) { + throw std::runtime_error("cannot unsqueeze empty tensor"); + } +#endif + std::vector sizes(tensor.sizes()); + std::vector strides(tensor.strides()); int64_t new_stride = dim >= tensor.dim() ? 
1 : sizes[dim] * strides[dim]; sizes.insert(sizes.begin() + dim, 1); strides.insert(strides.begin() + dim, new_stride); @@ -541,7 +563,7 @@ Tensor squeeze(const Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, dims); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided(self.sizes(), self.strides()); + return self.as_strided(self.sizes().vec(), self.strides().vec()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided(std::get<0>(g), std::get<1>(g)); @@ -557,7 +579,7 @@ Tensor & squeeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim()); if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided_(self.sizes(), self.strides()); + return self.as_strided_(self.sizes().vec(), self.strides().vec()); } auto g = inferSqueezeGeometry(self, dim); return self.as_strided_(std::get<0>(g), std::get<1>(g)); diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index 0648387b35d5ae..84759874ef5355 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -13,7 +13,7 @@ Tensor flip_cpu(const Tensor& self, IntList dims) { const int64_t total_dims = self.dim(), flip_dims_size = dims.size(); flip_check_errors(total_dims, flip_dims_size, dims); - auto flip_dims_v = dims.vec(); + auto flip_dims_v = std::vector(dims); wrap_all_dims(flip_dims_v, total_dims); std::sort(flip_dims_v.begin(), flip_dims_v.end()); auto final_indices = std::vector(total_dims); diff --git a/aten/src/ATen/native/TensorTransformations.h b/aten/src/ATen/native/TensorTransformations.h index 9b8c7d62b585c6..2504a2c3f201b8 100644 --- a/aten/src/ATen/native/TensorTransformations.h +++ b/aten/src/ATen/native/TensorTransformations.h @@ -14,7 +14,7 @@ static inline void flip_check_errors(int64_t total_dims, int64_t flip_dims_size, AT_CHECK(flip_dims_size > 0 && flip_dims_size <= total_dims, "flip dims size out of range, got flip dims size=", flip_dims_size); - auto flip_dims_v = dims.vec(); + auto flip_dims_v = std::vector(dims); // check if dims axis within range auto min_max_d = std::minmax_element(flip_dims_v.begin(), flip_dims_v.end()); diff --git a/aten/src/ATen/native/Vision.cpp b/aten/src/ATen/native/Vision.cpp new file mode 100644 index 00000000000000..458e9aca23f0fe --- /dev/null +++ b/aten/src/ATen/native/Vision.cpp @@ -0,0 +1,28 @@ +#include "ATen/ATen.h" +#include "ATen/NativeFunctions.h" +#include "ATen/detail/CUDAHooksInterface.h" + +namespace { + enum GridSamplerMode {GridSamplerModeZeros, GridSamplerModeBorder}; +} + +namespace at { namespace native { + +Tensor grid_sampler(const Tensor& input, const Tensor& grid, int64_t padding_mode) { + // cudnn does not support inputs larger than 1024 + if (at::native::cudnn_is_acceptable(input) && + padding_mode == GridSamplerModeZeros && + input.dim() == 4 && + input.size(1) <= 1024) { + return cudnn_grid_sampler(input, grid); + } + if (input.dim() == 4) { + return thnn_grid_sampler_bilinear2d(input, grid, padding_mode); + } + if (input.dim() == 5) { + return thnn_grid_sampler_bilinear3d(input, grid, padding_mode); + } + AT_ERROR("grid_sampler(): input must be 4d or 5d but got input of shape: ", input.dim()); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu deleted file mode 100644 index a47865f2023474..00000000000000 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ /dev/null @@ -1,788 +0,0 @@ -#include "ATen/ATen.h" -#include 
"ATen/native/GridSampler.h" -#include "ATen/cuda/CUDAContext.h" -#include "ATen/cuda/CUDAApplyUtils.cuh" -#include "ATen/cuda/detail/TensorInfo.cuh" -#include "ATen/cuda/detail/IndexUtils.cuh" -#include "ATen/cuda/detail/KernelUtils.h" - -namespace at { namespace native { - -using namespace at::cuda::detail; - -using at::native::detail::GridSamplerInterpolation; -using at::native::detail::GridSamplerPadding; - -namespace { - static __forceinline__ __device__ - int clip_coordinates(int in, int clip_limit) { - return ::min(clip_limit - 1, ::max(in, static_cast(0))); - } - - static __forceinline__ __device__ - bool within_bounds_2d(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; - } - - static __forceinline__ __device__ - bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; - } - - template - static __forceinline__ __device__ - void safe_add_2d(scalar_t *data, int h, int w, - int sH, int sW, int H, int W, - scalar_t delta) { - if (within_bounds_2d(h, w, H, W)) { - atomicAdd(data + h * sH + w * sW, delta); - } - } - - template - static __forceinline__ __device__ - void safe_add_3d(scalar_t *data, int d, int h, int w, - int sD, int sH, int sW, int D, int H, int W, - scalar_t delta) { - if (within_bounds_3d(d, h, w, D, H, W)) { - atomicAdd(data + d * sD + h * sH + w * sW, delta); - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_2d_kernel( - const int nthreads, - TensorInfo input, - TensorInfo grid, - TensorInfo output, - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_H = input.sizes[2]; - int inp_W = input.sizes[3]; - int out_H = grid.sizes[1]; - int out_W = grid.sizes[2]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sH = input.strides[2]; - int inp_sW = input.strides[3]; - int grid_sN = grid.strides[0]; - int grid_sH = grid.strides[1]; - int grid_sW = grid.strides[2]; - int grid_sCoor = grid.strides[3]; - int out_sN = output.strides[0]; - int out_sC = output.strides[1]; - int out_sH = output.strides[2]; - int out_sW = output.strides[3]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast(::floor(ixf)); - int iy_nw = static_cast(::floor(iyf)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_nw = clip_coordinates(ix_nw, inp_W); - iy_nw = clip_coordinates(iy_nw, inp_H); - ix_ne = clip_coordinates(ix_ne, inp_W); - iy_ne = 
clip_coordinates(iy_ne, inp_H); - ix_sw = clip_coordinates(ix_sw, inp_W); - iy_sw = clip_coordinates(iy_sw, inp_H); - ix_se = clip_coordinates(ix_se, inp_W); - iy_se = clip_coordinates(iy_se, inp_H); - } - - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_3d_kernel( - const int nthreads, - TensorInfo input, - TensorInfo grid, - TensorInfo output, - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_D = input.sizes[2]; - int inp_H = input.sizes[3]; - int inp_W = input.sizes[4]; - int out_D = grid.sizes[1]; - int out_H = grid.sizes[2]; - int out_W = grid.sizes[3]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sD = input.strides[2]; - int inp_sH = input.strides[3]; - int inp_sW = input.strides[4]; - int grid_sN = grid.strides[0]; - int grid_sD = grid.strides[1]; - int grid_sH = grid.strides[2]; - int grid_sW = grid.strides[3]; - int grid_sCoor = grid.strides[4]; - int out_sN = output.strides[0]; - int out_sC = output.strides[1]; - int out_sD = output.strides[2]; - int out_sH = output.strides[3]; - int out_sW = output.strides[4]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - float izf = ((iz + 1.f) / 2) * (inp_D - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - iz = static_cast(izf); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast(::floor(ix)); - int iy_tnw = static_cast(::floor(iy)); - int iz_tnw = static_cast(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = 
iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw = clip_coordinates(ix_tnw, inp_W); - iy_tnw = clip_coordinates(iy_tnw, inp_H); - iz_tnw = clip_coordinates(iz_tnw, inp_D); - ix_tne = clip_coordinates(ix_tne, inp_W); - iy_tne = clip_coordinates(iy_tne, inp_H); - iz_tne = clip_coordinates(iz_tne, inp_D); - ix_tsw = clip_coordinates(ix_tsw, inp_W); - iy_tsw = clip_coordinates(iy_tsw, inp_H); - iz_tsw = clip_coordinates(iz_tsw, inp_D); - ix_tse = clip_coordinates(ix_tse, inp_W); - iy_tse = clip_coordinates(iy_tse, inp_H); - iz_tse = clip_coordinates(iz_tse, inp_D); - ix_bnw = clip_coordinates(ix_bnw, inp_W); - iy_bnw = clip_coordinates(iy_bnw, inp_H); - iz_bnw = clip_coordinates(iz_bnw, inp_D); - ix_bne = clip_coordinates(ix_bne, inp_W); - iy_bne = clip_coordinates(iy_bne, inp_H); - iz_bne = clip_coordinates(iz_bne, inp_D); - ix_bsw = clip_coordinates(ix_bsw, inp_W); - iy_bsw = clip_coordinates(iy_bsw, inp_H); - iz_bsw = clip_coordinates(iz_bsw, inp_D); - ix_bse = clip_coordinates(ix_bse, inp_W); - iy_bse = clip_coordinates(iy_bse, inp_H); - iz_bse = clip_coordinates(iz_bse, inp_D); - } - - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCDHW = static_cast(0); - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += 
inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_2d_backward_kernel( - const int nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, - TensorInfo grad_input, // initialized to zeros - TensorInfo grad_grid, // initialized to empty - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_H = input.sizes[2]; - int inp_W = input.sizes[3]; - int out_H = grid.sizes[1]; - int out_W = grid.sizes[2]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sH = input.strides[2]; - int inp_sW = input.strides[3]; - int grid_sN = grid.strides[0]; - int grid_sH = grid.strides[1]; - int grid_sW = grid.strides[2]; - int grid_sCoor = grid.strides[3]; - int gOut_sN = grad_output.strides[0]; - int gOut_sC = grad_output.strides[1]; - int gOut_sH = grad_output.strides[2]; - int gOut_sW = grad_output.strides[3]; - int gInp_sN = grad_input.strides[0]; - int gInp_sC = grad_input.strides[1]; - int gInp_sH = grad_input.strides[2]; - int gInp_sW = grad_input.strides[3]; - int gGrid_sW = grad_grid.strides[2]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - - // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast(::floor(ixf)); - int iy_nw = static_cast(::floor(iyf)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; - - // calculate bilinear weighted pixel value and set output pixel - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_nw_cl = clip_coordinates(ix_nw, inp_W); - iy_nw_cl = clip_coordinates(iy_nw, inp_H); - ix_ne_cl = clip_coordinates(ix_ne, inp_W); - iy_ne_cl = clip_coordinates(iy_ne, inp_H); - ix_sw_cl = clip_coordinates(ix_sw, inp_W); - iy_sw_cl = clip_coordinates(iy_sw, inp_H); - ix_se_cl = clip_coordinates(ix_se, inp_W); - iy_se_cl = clip_coordinates(iy_se, inp_H); - } else { - ix_nw_cl = ix_nw; - iy_nw_cl = iy_nw; - ix_ne_cl = ix_ne; - iy_ne_cl = iy_ne; - ix_sw_cl = ix_sw; - iy_sw_cl = iy_sw; - ix_se_cl = ix_se; - iy_se_cl = iy_se; - } - - scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + 
n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw_cl, ix_nw_cl, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne_cl, ix_ne_cl, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw_cl, ix_sw_cl, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se_cl, ix_se_cl, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_nw_cl, ix_nw_cl, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw_cl * inp_sH + ix_nw_cl * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_ne_cl, ix_ne_cl, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne_cl * inp_sH + ix_ne_cl * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_sw_cl, ix_sw_cl, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw_cl * inp_sH + ix_sw_cl * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_2d(iy_se_cl, ix_se_cl, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se_cl * inp_sH + ix_se_cl * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1.f) / 2; - giy = giy * (inp_H - 1.f) / 2; - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW - // 2. 
directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] - scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NHW[0] = gix; - gGrid_ptr_NHW[1] = giy; - } - } - - template - __launch_bounds__(1024) - __global__ void grid_sampler_3d_backward_kernel( - const int nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, - TensorInfo grad_input, // initialized to zeros - TensorInfo grad_grid, // initialized to empty - const GridSamplerPadding padding_mode) { - - int C = input.sizes[1]; - int inp_D = input.sizes[2]; - int inp_H = input.sizes[3]; - int inp_W = input.sizes[4]; - int out_D = grid.sizes[1]; - int out_H = grid.sizes[2]; - int out_W = grid.sizes[3]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sD = input.strides[2]; - int inp_sH = input.strides[3]; - int inp_sW = input.strides[4]; - int grid_sN = grid.strides[0]; - int grid_sD = grid.strides[1]; - int grid_sH = grid.strides[2]; - int grid_sW = grid.strides[3]; - int grid_sCoor = grid.strides[4]; - int gOut_sN = grad_output.strides[0]; - int gOut_sC = grad_output.strides[1]; - int gOut_sD = grad_output.strides[2]; - int gOut_sH = grad_output.strides[3]; - int gOut_sW = grad_output.strides[4]; - int gInp_sN = grad_input.strides[0]; - int gInp_sC = grad_input.strides[1]; - int gInp_sD = grad_input.strides[2]; - int gInp_sH = grad_input.strides[3]; - int gInp_sW = grad_input.strides[4]; - int gGrid_sW = grad_grid.strides[3]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; - - // normalize ix, iy, iz from [-1, 1] to [0, inp_W-1] & [0, inp_H-1] & [0, inp_D-1] - float ixf = ((ix + 1.f) / 2) * (inp_W - 1); - float iyf = ((iy + 1.f) / 2) * (inp_H - 1); - float izf = ((iz + 1.f) / 2) * (inp_D - 1); - - ix = static_cast(ixf); - iy = static_cast(iyf); - iz = static_cast(izf); - - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast(::floor(ix)); - int iy_tnw = static_cast(::floor(iy)); - int iz_tnw = static_cast(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - 
scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; - int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; - int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; - int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; - - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - ix_tnw_cl = clip_coordinates(ix_tnw, inp_W); - iy_tnw_cl = clip_coordinates(iy_tnw, inp_H); - iz_tnw_cl = clip_coordinates(iz_tnw, inp_D); - ix_tne_cl = clip_coordinates(ix_tne, inp_W); - iy_tne_cl = clip_coordinates(iy_tne, inp_H); - iz_tne_cl = clip_coordinates(iz_tne, inp_D); - ix_tsw_cl = clip_coordinates(ix_tsw, inp_W); - iy_tsw_cl = clip_coordinates(iy_tsw, inp_H); - iz_tsw_cl = clip_coordinates(iz_tsw, inp_D); - ix_tse_cl = clip_coordinates(ix_tse, inp_W); - iy_tse_cl = clip_coordinates(iy_tse, inp_H); - iz_tse_cl = clip_coordinates(iz_tse, inp_D); - ix_bnw_cl = clip_coordinates(ix_bnw, inp_W); - iy_bnw_cl = clip_coordinates(iy_bnw, inp_H); - iz_bnw_cl = clip_coordinates(iz_bnw, inp_D); - ix_bne_cl = clip_coordinates(ix_bne, inp_W); - iy_bne_cl = clip_coordinates(iy_bne, inp_H); - iz_bne_cl = clip_coordinates(iz_bne, inp_D); - ix_bsw_cl = clip_coordinates(ix_bsw, inp_W); - iy_bsw_cl = clip_coordinates(iy_bsw, inp_H); - iz_bsw_cl = clip_coordinates(iz_bsw, inp_D); - ix_bse_cl = clip_coordinates(ix_bse, inp_W); - iy_bse_cl = clip_coordinates(iy_bse, inp_H); - iz_bse_cl = clip_coordinates(iz_bse, inp_D); - } else { - ix_tnw_cl = ix_tnw; - iy_tnw_cl = iy_tnw; - iz_tnw_cl = iz_tnw; - ix_tne_cl = ix_tne; - iy_tne_cl = iy_tne; - iz_tne_cl = iz_tne; - ix_tsw_cl = ix_tsw; - iy_tsw_cl = iy_tsw; - iz_tsw_cl = iz_tsw; - ix_tse_cl = ix_tse; - iy_tse_cl = iy_tse; - iz_tse_cl = iz_tse; - ix_bnw_cl = ix_bnw; - iy_bnw_cl = iy_bnw; - iz_bnw_cl = iz_bnw; - ix_bne_cl = ix_bne; - iy_bne_cl = iy_bne; - iz_bne_cl = iz_bne; - ix_bsw_cl = ix_bsw; - iy_bsw_cl = iy_bsw; - iz_bsw_cl = iz_bsw; - ix_bse_cl = ix_bse; - iy_bse_cl = iy_bse; - iz_bse_cl = iz_bse; - } - - scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCDHW; - - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne_cl, iy_tne_cl, ix_tne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse_cl, iy_tse_cl, ix_tse_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne_cl, iy_bne_cl, ix_bne_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse_cl, iy_bse_cl, ix_bse_cl, 
gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - - // calculate grad_grid - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tnw_cl, iy_tnw_cl, ix_tnw_cl, inp_D, inp_H, inp_W)) { - scalar_t tnw_val = inp_ptr_NC[iz_tnw_cl * inp_sD + iy_tnw_cl * inp_sH + ix_tnw_cl * inp_sW]; - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tne_cl, iy_tne_cl, ix_tne_cl, inp_D, inp_H, inp_W)) { - scalar_t tne_val = inp_ptr_NC[iz_tne_cl * inp_sD + iy_tne_cl * inp_sH + ix_tne_cl * inp_sW]; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tsw_cl, iy_tsw_cl, ix_tsw_cl, inp_D, inp_H, inp_W)) { - scalar_t tsw_val = inp_ptr_NC[iz_tsw_cl * inp_sD + iy_tsw_cl * inp_sH + ix_tsw_cl * inp_sW]; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_tse_cl, iy_tse_cl, ix_tse_cl, inp_D, inp_H, inp_W)) { - scalar_t tse_val = inp_ptr_NC[iz_tse_cl * inp_sD + iy_tse_cl * inp_sH + ix_tse_cl * inp_sW]; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bnw_cl, iy_bnw_cl, ix_bnw_cl, inp_D, inp_H, inp_W)) { - scalar_t bnw_val = inp_ptr_NC[iz_bnw_cl * inp_sD + iy_bnw_cl * inp_sH + ix_bnw_cl * inp_sW]; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bne_cl, iy_bne_cl, ix_bne_cl, inp_D, inp_H, inp_W)) { - scalar_t bne_val = inp_ptr_NC[iz_bne_cl * inp_sD + iy_bne_cl * inp_sH + ix_bne_cl * inp_sW]; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bsw_cl, iy_bsw_cl, ix_bsw_cl, inp_D, inp_H, inp_W)) { - scalar_t bsw_val = inp_ptr_NC[iz_bsw_cl * inp_sD + iy_bsw_cl * inp_sH + ix_bsw_cl * inp_sW]; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; - } - if (padding_mode != GridSamplerPadding::Zeros || within_bounds_3d(iz_bse_cl, iy_bse_cl, ix_bse_cl, inp_D, inp_H, inp_W)) { - scalar_t bse_val = inp_ptr_NC[iz_bse_cl * inp_sD + iy_bse_cl * inp_sH + ix_bse_cl * inp_sW]; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; - } - } - - // un-normalize grad_grid values back to [-1, 1] constraints - gix = gix * (inp_W - 1) / 2; - giy = giy * (inp_H - 1) / 2; - giz = giz * (inp_D - 1) / 2; - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW - // 2. 
directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] - scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NDHW[0] = gix; - gGrid_ptr_NDHW[1] = giy; - gGrid_ptr_NDHW[2] = giz; - } - } -} // namespace - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto H = grid.size(1); - auto W = grid.size(2); - auto output = at::empty({N, input.size(1), H, W}, input.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { - int count = static_cast(N * H * W); - grid_sampler_2d_kernel - <<>>( - count, - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(output), - static_cast(padding_mode)); - }); - return output; -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto D = grid.size(1); - auto H = grid.size(2); - auto W = grid.size(3); - auto output = at::empty({N, input.size(1), D, H, W}, input.options()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_cuda", [&] { - int count = static_cast(N * D * H * W); - grid_sampler_3d_kernel - <<>>( - count, - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(output), - static_cast(padding_mode)); - }); - return output; -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto H = grid.size(1); - auto W = grid.size(2); - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_2d_backward_cuda", [&] { - int count = static_cast(N * H * W); - grid_sampler_2d_backward_kernel - <<>>( - count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(grad_input), - getTensorInfo(grad_grid), - static_cast(padding_mode)); - }); - return std::make_tuple(grad_input, grad_grid); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
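The grid sampler kernels deleted above all follow the same recipe: un-normalize each grid coordinate from [-1, 1] to pixel space, gather the surrounding integer neighbours, and blend them with area ("surface") weights, reading zero for out-of-bounds taps in zeros padding mode. A minimal CPU sketch of the 2-D bilinear case, plain C++ with a hypothetical helper name (not the CUDA kernel):

    #include <cmath>
    #include <vector>

    // Bilinear sample of a single-channel H x W image at a normalized grid
    // coordinate (gx, gy) in [-1, 1], with zeros padding outside the image.
    float grid_sample_2d_sketch(const std::vector<float>& img, int H, int W, float gx, float gy) {
      // Un-normalize from [-1, 1] to pixel coordinates [0, W-1] / [0, H-1].
      float ix = (gx + 1.f) / 2.f * (W - 1);
      float iy = (gy + 1.f) / 2.f * (H - 1);
      int ix_nw = static_cast<int>(std::floor(ix));
      int iy_nw = static_cast<int>(std::floor(iy));
      // Area weights of the four neighbours; they sum to 1.
      float nw = (ix_nw + 1 - ix) * (iy_nw + 1 - iy);
      float ne = (ix - ix_nw)     * (iy_nw + 1 - iy);
      float sw = (ix_nw + 1 - ix) * (iy - iy_nw);
      float se = (ix - ix_nw)     * (iy - iy_nw);
      auto at = [&](int y, int x) -> float {
        // Zeros padding: out-of-bounds reads contribute nothing.
        return (y >= 0 && y < H && x >= 0 && x < W) ? img[y * W + x] : 0.f;
      };
      return at(iy_nw, ix_nw) * nw + at(iy_nw, ix_nw + 1) * ne +
             at(iy_nw + 1, ix_nw) * sw + at(iy_nw + 1, ix_nw + 1) * se;
    }

Border padding differs only in clipping the neighbour indices to the image before the reads, as the deleted clip_coordinates helper does.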
-std::tuple -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode) { - auto N = input.size(0); - auto D = grid.size(1); - auto H = grid.size(2); - auto W = grid.size(3); - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "grid_sampler_3d_backward_cuda", [&] { - int count = static_cast(N * D * H * W); - grid_sampler_3d_backward_kernel - <<>>( - count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), - getTensorInfo(grad_input), - getTensorInfo(grad_grid), - static_cast(padding_mode)); - }); - return std::make_tuple(grad_input, grad_grid); -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 12f22fcaf2f216..4b474e0c079e77 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -76,9 +76,6 @@ void gpu_nullary_kernel(TensorIterator& iter, const func_t& f) { using arg0_t = typename traits::result_type; int64_t numel = iter.numel(); - if (numel == 0) { - return; - } if (iter.is_trivial_1d()) { auto strides = iter.get_inner_strides(); int stride0 = strides[0]; @@ -108,9 +105,6 @@ void gpu_unary_kernel(TensorIterator& iter, const func_t& f) { using arg1_t = typename traits::arg1_t; int64_t numel = iter.numel(); - if (numel == 0) { - return; - } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); @@ -158,9 +152,6 @@ void gpu_binary_kernel(TensorIterator& iter, const func_t& f) { using arg2_t = typename traits::arg2_t; int numel = iter.numel(); - if (numel == 0) { - return; - } if (iter.is_cpu_scalar(1)) { auto a = iter.scalar_value(1); iter.remove_operand(1); diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu deleted file mode 100644 index 70ece3f4440cf7..00000000000000 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ /dev/null @@ -1,625 +0,0 @@ -// Copyright (c) 2018 MathInf GmbH, Thomas Viehmann -// Licensed under the BSD-3-Clause license -// This is the GPU implementation of the Connectionist Temporal Loss. -// We mostly follow Graves. -// 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf -// We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based. -// Graves et al call the probabilities y, we use log_probs (also calling them inputs) -// A few optimizations (simmilar to those here, but also some I didn't take) are described in -// 2. Minmin Sun: http://on-demand.gputechconf.com/gtc/2016/presentation/s6383-minmin-sun-speech-recognition.pdf - -#include -#include - -#include -#include "ATen/Dispatch.h" -#include "ATen/cuda/CUDAApplyUtils.cuh" - -#include -#include - -namespace at { -namespace native { - -namespace { - -// this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done -// __restrict__ impact to be measured, https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ -template -__device__ static inline int64_t get_target_prime(const target_t* __restrict__ target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { - if (idx % 2 == 0) { - return BLANK; - } else { - return target[offset + stride * (idx / 2)]; - } -} - -// this kernel is a relatively straightforward implementation of the alpha calculation in the forward backward algorithm (section 4.1). 
-// A (minor) twist is that we are using log-calculations to enhance numerical stability (log_probs and log_alpha). -// In total it would be more efficient to compute the beta in the same kernel (e.g. cudnn does this). While the beta are not -// needed for the loss itself (just the grad), we can return log_alpha+log_beta (so same space as currently) and the overhead -// is small and the use-case for loss without grad is relatively limited. -// We parallelize by batch and target sequence. Empirically, it is faster to loop over the input (log probs) sequence and do -// target in parallel, even if it means more frequent __syncthreads. -// In contrast to the cuDNN implementation, we allow large target lengths. For this we need that all previous `s` have been -// computed when we start a new block_s. This is why we have our own for loop here. -template -__global__ void ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - scalar_t* __restrict__ neg_log_likelihood_data, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t BLANK) { - - constexpr scalar_t neginf = -INFINITY; - - // bookkeeping - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t la_batch_offset = b*la_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - if (b >= batch_size) - return; - - // first row (t=0), the three equations for alpha_1 above eq (6) - for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { - int64_t s = threadIdx.x + block_s; - scalar_t la; - switch (s) { - case 0: - la = log_probs_data[lp_batch_offset + lp_char_stride * BLANK]; - break; - case 1: - if (target_length > 0) { - la = log_probs_data[lp_batch_offset + lp_char_stride * get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; - } - else { - la = neginf; - } - break; - default: - la = neginf; - } - if (s < 2*max_target_length+1) - log_alpha_data[la_batch_offset + /* la_input_stride * 0 */ + la_target_stride * s] = la; - } - - for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { - int64_t s = threadIdx.x + block_s; - - // These two only depend on s, so we can cache them. - int64_t current_char; // l_s in eq (6) - bool have_three; // flag which of the two cases in eq (6) we have - if (s < 2*target_length+1) { - current_char = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - have_three = ((s > 1) && (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s-2, BLANK) != - current_char)); - } else { - current_char = BLANK; - have_three = false; - } - for (int64_t t=1; t < max_input_length; t++) { - __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch - if ((t < input_length) && (target_length > 0) && (s < 2*target_length+1)) { - // only for valid t, s. This is equation (6) and (7), la1, la2, la3 are the three summands, - // lamax is the maximum for the logsumexp trick. 
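The comment above refers to the usual logsumexp trick: subtract the running maximum before exponentiating so finite terms cannot overflow, and guard the all-neginf case. A tiny standalone illustration (plain C++, not the kernel code):

    #include <algorithm>
    #include <cmath>
    #include <limits>

    // Numerically stable log(exp(a) + exp(b) + exp(c)), matching the pattern
    // used in the alpha/beta recursions: factor out the maximum first.
    double log_sum_exp3(double a, double b, double c) {
      double m = std::max({a, b, c});
      if (m == -std::numeric_limits<double>::infinity()) {
        // All inputs are log(0); the sum is log(0) too, and exp(x - m) would
        // be NaN, so mirror the kernel's "pretend the max is 0" guard.
        return m;
      }
      return std::log(std::exp(a - m) + std::exp(b - m) + std::exp(c - m)) + m;
    }

The same guard reappears in the beta recursion further down.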
- scalar_t la1 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * s]; - scalar_t lamax = la1; - scalar_t la2, la3; - if (s > 0) { - la2 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-1)]; - if (la2 > lamax) - lamax = la2; - } else { - la2 = neginf; - } - if (have_three) { - la3 = log_alpha_data[la_batch_offset + la_input_stride * (t-1) + la_target_stride * (s-2)]; - if (la3 > lamax) - lamax = la3; - } else { - la3 = neginf; - } - if (lamax == neginf) // when all are neginf. (then the whole thing is neginf, but we can pretend) - lamax = 0; - - log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = std::log(std::exp(la1-lamax)+std::exp(la2-lamax)+std::exp(la3-lamax))+lamax - + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_char]; - } else { - // otherwise we just set to neginf - if (s < 2*max_target_length+1) - log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] = neginf; - } - } - } - __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch - - // compute the loss (eq (8)) - if (threadIdx.x == 0) { - scalar_t l1 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2)]; - scalar_t l2 = log_alpha_data[la_batch_offset + la_input_stride * (input_length-1) + la_target_stride * (target_length*2-1)]; - scalar_t m = ((l1 > l2) ? l1 : l2); - m = ((m == neginf) ? 0 : m); - scalar_t log_likelihood = std::log(std::exp(l1-m)+std::exp(l2-m))+m; - neg_log_likelihood_data[b] = -log_likelihood; - } -} - -// The forward computation. Lot's of admin and a call to the alpha kernel. -// Note: we do not check that the labels are in the valid range. As we use -// them for indexing in the kernels, you'll see memory errors when you -// pass corrupt labels. -// We support both a 2-dimensional tensor as targets (one set of targets in each row) and -// a 1-dimensional tensor where all targets are concatenated (and we use target_lengths -// to figure out where they begin). -// We return log_alpha (currently, might change to (log_alpha+log_beta) to be passed to the -// backward. The dispatch function will only return the loss. 
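The forward wrapper described above accepts targets either as one concatenated 1-D tensor (per-sample offsets are running sums of target_lengths) or as a padded 2-D batch x max_target_length tensor (offsets are multiples of the batch stride). A small host-side sketch of that bookkeeping, with hypothetical names:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct TargetLayout {
      std::vector<int64_t> batch_offsets;  // start of each sample's targets
      int64_t max_target_length;           // longest target in the batch
    };

    // Concatenated 1-D targets: offsets are running sums of the lengths.
    // Padded 2-D targets: offsets are i * batch_stride and the max length is
    // simply the padded width. Mirrors the bookkeeping described above.
    TargetLayout make_offsets(const std::vector<int64_t>& target_lengths,
                              bool concatenated, int64_t batch_stride, int64_t padded_len) {
      TargetLayout out{std::vector<int64_t>(target_lengths.size()), 0};
      int64_t pos = 0;
      for (size_t i = 0; i < target_lengths.size(); ++i) {
        if (concatenated) {
          out.batch_offsets[i] = pos;
          pos += target_lengths[i];
          out.max_target_length = std::max(out.max_target_length, target_lengths[i]);
        } else {
          out.batch_offsets[i] = static_cast<int64_t>(i) * batch_stride;
          out.max_target_length = padded_len;
        }
      }
      return out;
    }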
-template -std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - // log_probs: input_len x batch_size x num_labels - // targets [int64]: batch_size x target_length OR sum(target_lengths) - CheckedFrom c = "ctc_loss_gpu"; - using target_t = typename std::conditional::type; - auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to log_probs cuda if it isn't there already - auto log_probs_arg = TensorArg(log_probs, "log_probs", 1); - auto targets_arg = TensorArg(targets, "targets", 2); - checkAllSameGPU(c, {log_probs_arg, targets_arg}); - - checkScalarType(c, targets_arg, target_scalar_type); - checkDim(c, log_probs_arg, 3); - checkDimRange(c, targets_arg, 1, 3); - - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - AT_CHECK(BLANK < num_labels, "blank must be in label range"); - AT_CHECK(input_lengths.size() == batch_size, "input_lengths must be of size batch_size"); - AT_CHECK(target_lengths.size() == batch_size, "target_lengths must be of size batch_size"); - - int64_t lp_input_stride = log_probs.stride(0); - int64_t lp_char_stride = log_probs.stride(2); - int64_t tg_target_stride; - - int64_t max_target_length; - auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); - auto tg_batch_offsets_data = tg_batch_offsets.data(); - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - checkSize(c, targets_arg, 0, pos); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - checkSize(c, targets_arg, 0, batch_size); - AT_CHECK(targets.size(1) >= max_target_length, - "Expected tensor to have size at least ", max_target_length, " at dimension 1, but got size ", targets.size(1), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < batch_size; b++) { - AT_CHECK(input_lengths[b] <= max_input_length, - "Expected tensor to have size at least ", max_input_length, " at dimension 1, but got size ", targets.size(0), " for ", targets_arg, - " (while checking arguments for ", c, ")"); - } - - auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); - - Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); - Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - - // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... 
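The launch configuration computed next halves the x-dimension thread count while it still spans the 2*max_target_length+1 augmented labels, hands the leftover threads to batch items, and sizes the grid to cover the remainder. Roughly, as a host-side sketch with hypothetical names (not the actual launch code):

    #include <algorithm>
    #include <cstdint>

    struct LaunchShape { int threads_target, threads_batch, blocks_target, blocks_batch; };

    // Pick a (target, batch) thread split under a 1024-thread cap: shrink the
    // target dimension while it still covers 2*max_target_length+1, then use
    // the leftover capacity for batch items; blocks cover whatever is left.
    LaunchShape pick_launch_shape(int64_t max_target_length, int64_t batch_size) {
      constexpr int max_threads = 1024;
      int64_t span = 2 * max_target_length + 1;
      int threads_target = max_threads;
      while (threads_target / 2 >= span) threads_target /= 2;
      int threads_batch = static_cast<int>(std::min<int64_t>(max_threads / threads_target, batch_size));
      int blocks_target = static_cast<int>((span + threads_target - 1) / threads_target);
      int blocks_batch  = static_cast<int>((batch_size + threads_batch - 1) / threads_batch);
      return {threads_target, threads_batch, blocks_target, blocks_batch};
    }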
- constexpr int max_threads = 1024; - int threads_target = max_threads; - while (threads_target / 2 >= 2*max_target_length+1) { - threads_target /= 2; - } - int threads_batch = std::min(max_threads / threads_target, (int) batch_size); - - dim3 block(threads_target, threads_batch); - dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - ctc_loss_log_alpha_gpu_kernel<<>>( - log_alpha.data(), - log_probs.data(), input_lengths_t.data(), log_probs.size(0), - targets.data(), target_lengths_t.data(), max_target_length, - neg_log_likelihood.data(), - log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), - log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), - tg_batch_offsets.data(), tg_target_stride, - batch_size, BLANK); - return std::make_tuple(neg_log_likelihood, log_alpha); -} - -// The second (backward) half of the forward backward algorithm, (10) and (11). This is parallel to the -// alpha kernel above. (As mentioned above, it might make sense do the calculation in the alpha kernel.) -template -__global__ void ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t BLANK) { - constexpr scalar_t neginf = -INFINITY; - - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t lb_batch_offset = b*lb_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - if (b >= batch_size) - return; - - // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) - for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { - int64_t s = threadIdx.x + block_s; - scalar_t lb; - if (s == 2*target_length) { - lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * BLANK]; - } else if ((target_length > 0) && (s == 2*target_length-1)) { - int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - lb = log_probs_data[lp_batch_offset + (input_length-1) * lp_input_stride + lp_char_stride * current_target_prime]; - } else { - lb = neginf; - } - if (s < 2*max_target_length+1) { - log_beta_data[lb_batch_offset + (input_length-1) * lb_input_stride + lb_target_stride * s] = lb; - } - } - - // go backward in s - for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { - int64_t s = threadIdx.x + block_s; - int64_t current_target_prime; - bool have_three; - if (s < 2*target_length+1) { - current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - have_three = ((s < 2*target_length-1) && - (get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s+2, BLANK) != - current_target_prime)); - } else { - current_target_prime = BLANK; - have_three = false; - } - // now 
go backward in t. Note that we need to skip the last timestep that we did above. - for (int64_t t=max_input_length-2; t>=0; t--) { - __syncthreads(); // on cuda 9 we might use partial synchronization of only the threads within the same batch item - if ((t < input_length-1) && (target_length > 0) && (s < 2*target_length+1)) { - scalar_t lb1 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * s]; - scalar_t lbmax = lb1; - scalar_t lb2, lb3; - - if (s < 2*target_length) { - lb2 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+1)]; - if (lb2 > lbmax) - lbmax = lb2; - } else { - lb2 = neginf; - } - if (have_three) { - lb3 = log_beta_data[lb_batch_offset + lb_input_stride * (t+1) + lb_target_stride * (s+2)]; - if (lb3 > lbmax) - lbmax = lb3; - } else { - lb3 = neginf; - } - if (lbmax == neginf) - lbmax = 0; - - scalar_t lb = std::log(std::exp(lb1-lbmax)+std::exp(lb2-lbmax)+std::exp(lb3-lbmax))+lbmax - + log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * current_target_prime]; - - log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = lb; - } else if ((s < 2*max_target_length+1) || (t >= input_length)) { - log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s] = neginf; - } - } - } -} - -// This implements the subtrahend of equation (16) for all *nonblank* characters. -// It assumes you have probs in gradient_data when called -// and it modifies gradient_data to be, the gradient. -// In order to facilitate this inplace update, We don't actually do this in logspace. -// (The other variant implemented uses log_space and the differences seem to be -// not so problematic at least with unit normal distributed test activations.) -// Internally this uses atomicAdd because different threads may write to the same -// gradient position. -// This is parallelised over b and s again. -// Note that for us, the Z of eqn (16) is actually constant for all t and it is the -// likelihood - this is why we use the negative log likelihood below. -// We also multiply by the input gradient to keep with standard autograd style. -// I took this trick from [2], for moderate alphabet sizes a log-space -// calculation (with an atomic log add) is similarly in performance, but for large -// alphabets the inplace nature is a considerable advantage. -template -__global__ void ctc_loss_backward_collect_nonblank_gpu_kernel(scalar_t* __restrict__ gradient_data, - const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, - const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - const scalar_t* __restrict__ neg_log_likelihood_data, - int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, - int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t num_labels, int64_t BLANK) { - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - int64_t s = threadIdx.x + blockIdx.x * blockDim.y; // note, this directly indexes into targets, no targets prime! 
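The kernel above assumes gradient_data already holds the probabilities and subtracts the eq. (16) term exp(log_alpha + log_beta + nll - log_prob) * grad_out at each (t, target[s]), using atomicAdd because several target positions can name the same character. A serial CPU sketch of that update for one batch element, with hypothetical names:

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // grad is [T x num_labels] and holds probabilities on entry; log_alpha and
    // log_beta are [T x S] with S = 2*target.size()+1 (blanks at even indices).
    void collect_nonblank_sketch(std::vector<double>& grad,
                                 const std::vector<double>& log_probs,
                                 const std::vector<double>& log_alpha,
                                 const std::vector<double>& log_beta,
                                 const std::vector<int64_t>& target,
                                 double nll, double grad_out,
                                 int64_t T, int64_t num_labels) {
      int64_t S = 2 * static_cast<int64_t>(target.size()) + 1;
      for (int64_t s = 0; s < static_cast<int64_t>(target.size()); ++s) {
        int64_t c = target[s];  // the non-blank character this s refers to
        for (int64_t t = 0; t < T; ++t) {
          double lp = log_probs[t * num_labels + c];
          // Augmented index 2*s+1 is where target[s] sits in l'.
          grad[t * num_labels + c] -=
              std::exp(log_alpha[t * S + 2 * s + 1] + log_beta[t * S + 2 * s + 1] + nll - lp) * grad_out;
        }
      }
    }

Working in probability space is what makes this update cheap to do in place; the log-space variant mentioned in the comment would need an atomic log-add instead.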
- - if (b >= batch_size) - return; - - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t gr_batch_offset = b*gr_batch_stride; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t la_batch_offset = b*la_batch_stride; - int64_t lb_batch_offset = b*lb_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - if (s >= target_length) - return; - - int64_t target = targets_data[tg_batch_offset + s * tg_target_stride]; - scalar_t nll = neg_log_likelihood_data[b]; - scalar_t gr = grad_out_data[b * grad_out_batch_stride]; - - for (int64_t t = 0; t < input_length; t++) { - scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * target]; - atomicAdd(&gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * target], - -std::exp(log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * (s*2+1)] - + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * (s*2+1)] - + nll - lp) * gr); - } -} - -// This is the naive implementation of equation (16). It is parallelised in batch and input timestep. -// It appears to be faster than the above method for small batch sizes. -template -__global__ void ctc_loss_backward_collect_gpu_kernel(scalar_t* __restrict__ gradient_data, - const scalar_t* __restrict__ grad_out_data, int64_t grad_out_batch_stride, - const scalar_t* __restrict__ log_alpha_data, const scalar_t* __restrict__ log_beta_data, - const scalar_t*log_probs_data, const int64_t* __restrict__ input_lengths, int64_t max_input_length, - const target_t* __restrict__ targets_data, const int64_t* __restrict__ target_lengths, int64_t max_target_length, - const scalar_t* __restrict__ neg_log_likelihood_data, - int64_t gr_input_stride, int64_t gr_batch_stride, int64_t gr_char_stride, - int64_t lp_input_stride, int64_t lp_batch_stride, int64_t lp_char_stride, - int64_t la_batch_stride, int64_t la_input_stride, int64_t la_target_stride, - int64_t lb_batch_stride, int64_t lb_input_stride, int64_t lb_target_stride, - const int64_t* __restrict__ tg_batch_offsets, int64_t tg_target_stride, - int64_t batch_size, int64_t num_labels, int64_t BLANK) { - - constexpr scalar_t neginf = -INFINITY; - int64_t b = threadIdx.y + blockIdx.y * blockDim.y; - int64_t t = threadIdx.x + blockIdx.x * blockDim.x; - - if ((t >= max_input_length) || (b >= batch_size)) - return; - - int64_t input_length = input_lengths[b]; - int64_t target_length = target_lengths[b]; - int64_t gr_batch_offset = b*gr_batch_stride; - int64_t lp_batch_offset = b*lp_batch_stride; - int64_t la_batch_offset = b*la_batch_stride; - int64_t lb_batch_offset = b*lb_batch_stride; - int64_t tg_batch_offset = tg_batch_offsets[b]; - - // collected[b, t, target'[s]] "log+=" log_alpha[t, s]+log_beta[t, s] - for (int s = 0; s < 2*max_target_length+1; s++) { - if ((target_length > 0) && (s < 2*target_length+1)) { - int64_t current_target_prime = get_target_prime(targets_data, tg_batch_offset, tg_target_stride, s, BLANK); - scalar_t log_alpha_beta = (log_alpha_data[la_batch_offset + la_input_stride * t + la_target_stride * s] - + log_beta_data[lb_batch_offset + lb_input_stride * t + lb_target_stride * s]); - scalar_t& lcab = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * current_target_prime]; - if (lcab == neginf) { - lcab = log_alpha_beta; - } else { - scalar_t max = ((lcab > log_alpha_beta) ? 
lcab : log_alpha_beta); - lcab = std::log(std::exp(lcab-max)+std::exp(log_alpha_beta-max))+max; - } - } - } - - scalar_t nll = neg_log_likelihood_data[b]; - scalar_t gr = grad_out_data[b * grad_out_batch_stride]; - - for (int64_t c = 0; c < num_labels; c++) { - scalar_t& res = gradient_data[gr_batch_offset + t * gr_input_stride + gr_char_stride * c]; - if (t < input_length) { - scalar_t lp = log_probs_data[lp_batch_offset + t * lp_input_stride + lp_char_stride * c]; - res = std::exp(lp)-std::exp(res + nll - lp) * gr; - } - else { - res = 0.; - } - } -} - -// The backward. It essentially computes eq 16 by using the above kernels. -// We don't do a lot of checking as we envision this to be called only when backpropagating through a (well-checked) forward. -template -Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_probs, const Tensor& targets_, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - constexpr scalar_t neginf = -INFINITY; - using target_t = typename std::conditional::type; - auto targets = targets_.toType(log_probs.type().toScalarType(target_scalar_type)); // to cuda if it isn't there already - int64_t batch_size = log_probs.size(1); - int64_t num_labels = log_probs.size(2); - int64_t lp_input_stride = log_probs.stride(0); - int64_t lp_char_stride = log_probs.stride(2); - int64_t tg_target_stride; - - int64_t max_target_length; - auto tg_batch_offsets = at::empty({batch_size}, TensorOptions(at::CPU(kLong))); - auto tg_batch_offsets_data = tg_batch_offsets.data(); - if (targets.dim() == 1) { // concatenated targets - int64_t pos = 0; - max_target_length = 0; - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = pos; - pos += target_lengths[i]; - if (max_target_length < target_lengths[i]) - max_target_length = target_lengths[i]; - } - tg_target_stride = targets.stride(0); - } - else { // batch x max_target_length - // dim is 2 - int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { - tg_batch_offsets_data[i] = i * tg_batch_stride; - } - tg_target_stride = targets.stride(1); - max_target_length = targets.size(1); - } - auto target_lengths_t = at::tensor(target_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - auto input_lengths_t = at::tensor(input_lengths, targets.options().device(at::Device(at::Device::Type::CPU)).dtype(kLong)).toType(targets.type().toScalarType(kLong)); - tg_batch_offsets = tg_batch_offsets.toType(targets.type().toScalarType(kLong)); - - Tensor log_beta = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); - Tensor grad = at::full_like(log_probs, neginf); // initialization for log(sum (alpha beta)) - - // As above, there may be better configurations to use. 
- constexpr int max_threads = 1024;
- int threads_target = max_threads;
- while (threads_target / 2 >= 2*max_target_length+1) {
- threads_target /= 2;
- }
- int threads_batch = std::min(max_threads / threads_target, (int) batch_size);
-
- cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
- {
- dim3 block(threads_target, threads_batch);
- dim3 grid((2*max_target_length+1 + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch);
-
- ctc_loss_backward_log_beta_gpu_kernel<<<grid, block, 0, stream>>>
- (log_beta.data(),
- log_probs.data(), input_lengths_t.data(), log_probs.size(0),
- targets.data(), target_lengths_t.data(), max_target_length,
- log_probs.stride(0), log_probs.stride(1), log_probs.stride(2),
- log_beta.stride(0), log_beta.stride(1), log_beta.stride(2),
- tg_batch_offsets.data(), tg_target_stride,
- batch_size, BLANK);
- }
-
- // Very crude heuristic for what is a small problem, based on linearly regressing problem dimensions on
- // the (capped) difference of timings.
- // Note that for OK problems target length <= input length, so we
- // only consider input length.
- bool is_large = (2*log_probs.size(0)+(24*batch_size)/10+(2*num_labels)/10) > 450;
- if (is_large) { // large alphabet, large batch
- // this computes the probs, minuend in (16)
- exp_out(grad, log_probs);
- // now we compute the subtrahend for the blanks. It is a straightforward reduction because we know that
- // blanks are in every other position.
- // maybe we should kernelize this, too.
- auto grad_blank = grad.narrow(2, BLANK, 1);
- grad_blank -= (at::logsumexp(log_alpha.as_strided({batch_size, log_alpha.size(1), max_target_length+1},
- {log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2)*2})
- + log_beta.as_strided({batch_size, log_beta.size(1), max_target_length+1},
- {log_beta.stride(0), log_beta.stride(1), log_beta.stride(2)*2}),
- 2, true)
- .permute({1, 0, 2})
- .add_(neg_log_likelihood.view({1, batch_size, 1}))
- .sub_(log_probs.narrow(2, BLANK, 1))
- .exp_()
- );
- // For the non-blank characters, we use a kernel to compute the subtrahend.
- // Again we might configure block and grid in a better way.
- int threads_target = max_threads;
- while (threads_target / 2 >= max_target_length) {
- threads_target /= 2;
- }
- int threads_batch = std::min(max_threads / threads_target, (int) batch_size);
- dim3 block(threads_target, threads_batch);
- dim3 grid((max_target_length + threads_target-1)/threads_target, (batch_size+threads_batch-1)/threads_batch);
- ctc_loss_backward_collect_nonblank_gpu_kernel<<<grid, block, 0, stream>>>
- (grad.data(),
- grad_out.data(), grad_out.stride(0),
- log_alpha.data(), log_beta.data(),
- log_probs.data(), input_lengths_t.data(), log_probs.size(0),
- targets.data(), target_lengths_t.data(), max_target_length,
- neg_log_likelihood.data(),
- grad.stride(0), grad.stride(1), grad.stride(2),
- log_probs.stride(0), log_probs.stride(1), log_probs.stride(2),
- log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2),
- log_beta.stride(0), log_beta.stride(1), log_beta.stride(2),
- tg_batch_offsets.data(), tg_target_stride,
- batch_size, num_labels, BLANK);
- } else { // small problem, use naive algorithm
- // Still no block/grid configuration guru...
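// [editor's note] The same block-sizing pattern appears three times in this function
// (for 2*max_target_length+1 above, for max_target_length in the non-blank branch,
// and for log_probs.size(0) just below): pick the smallest power of two, capped at
// 1024, that covers the parallelised axis, then hand the remaining threads of the
// block to the batch dimension. A standalone sketch of that heuristic; the helper
// name and free-function form are ours, not ATen API.
#include <algorithm>
#include <cstdint>
#include <utility>

static std::pair<int, int> pick_block_shape(int64_t axis_extent, int64_t batch_size,
                                            int max_threads = 1024) {
  int threads_axis = max_threads;
  // Halving stops once another halving would no longer cover axis_extent.
  while (threads_axis / 2 >= axis_extent) {
    threads_axis /= 2;
  }
  int threads_batch = std::min(max_threads / threads_axis, (int) batch_size);
  return {threads_axis, threads_batch};
}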
- int threads_input = max_threads; - while (threads_input / 2 >= log_probs.size(0)) { - threads_input /= 2; - } - threads_batch = std::min(max_threads / threads_input, (int) batch_size); - dim3 block(threads_input, threads_batch); - dim3 grid((log_probs.size(0) + threads_input-1)/threads_input, (batch_size+threads_batch-1)/threads_batch); - - ctc_loss_backward_collect_gpu_kernel<<>> - (grad.data(), - grad_out.data(), grad_out.stride(0), - log_alpha.data(), log_beta.data(), - log_probs.data(), input_lengths_t.data(), log_probs.size(0), - targets.data(), target_lengths_t.data(), max_target_length, - neg_log_likelihood.data(), - grad.stride(0), grad.stride(1), grad.stride(2), - log_probs.stride(0), log_probs.stride(1), log_probs.stride(2), - log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), - log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), - tg_batch_offsets.data(), tg_target_stride, - batch_size, num_labels, BLANK); - } - return grad; -} - -} // namespace - -std::tuple ctc_loss_gpu(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } else { - return ctc_loss_gpu_template(log_probs, targets, input_lengths, target_lengths, BLANK); - } - }); -} - -Tensor ctc_loss_backward_gpu(const Tensor& grad, const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, - const Tensor& neg_log_likelihood, const Tensor& log_alpha, int64_t BLANK) { - return AT_DISPATCH_FLOATING_TYPES(log_probs.type(), "ctc_loss_backward", [&] { - if (targets.type().scalarType() == kLong) { - return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } else { - return ctc_loss_backward_gpu_template(grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, BLANK); - } - }); -} - -} } // at::native diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index 5cde662fba78a6..420733dc558c06 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -20,9 +20,17 @@ Tensor& eye_out_cuda(Tensor& result, int64_t n) { } Tensor& eye_out_cuda(Tensor& result, int64_t n, int64_t m) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_CHECK(n > 0, "n must be greater than 0, got ", n); +#else AT_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); +#endif +#ifndef USE_TH_SIZE_ZERO_DIM + if(m <= 0) { +#else if(m < 0) { +#endif m = n; } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index f97395d6392ca6..7fa1fe64f28d6f 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -80,7 +80,7 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { return out_tensor; } - auto flip_dims = dims.vec(); + auto flip_dims = std::vector(dims); wrap_all_dims(flip_dims, total_dims); // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work @@ -99,10 +99,10 @@ Tensor flip_cuda(const Tensor& self, IntList dims) { auto flip_dims_t = at::CPU(kLong).tensorFromBlob(flip_dims.data(), {static_cast(flip_dims.size())}); - auto shape = 
in_tensor.sizes().vec(); + auto shape = std::vector(in_tensor.sizes()); auto shape_t = at::CPU(kLong).tensorFromBlob(shape.data(), {static_cast(shape.size())}); - auto strides = in_tensor.strides().vec(); + auto strides = std::vector(in_tensor.strides()); auto strides_t = at::CPU(kLong).tensorFromBlob(strides.data(), {static_cast(strides.size())}); // stride_contiguous is the stride of non-contiguous tensor after calling contiguous(), diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp deleted file mode 100644 index 966aa20e0a128d..00000000000000 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include -#if AT_CUDNN_ENABLED() - #include -#endif - - -#if !AT_CUDNN_ENABLED() || (CUDNN_VERSION < 7000) - -namespace at { namespace native { - -// See Note [ATen preprocessor philosophy] - -std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntList input_lengths, IntList target_lengths, int64_t BLANK, bool deterministic) { - throw std::runtime_error("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); -} - -}} - -#else // AT_CUDNN_ENABLED - -#include -#include -#include - -#include - -namespace at { namespace native { - -namespace { - -} // namespace - -std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntList input_lengths_, IntList target_lengths_, int64_t BLANK, bool deterministic) { - CheckedFrom c = "cudnn_ctc_loss"; - TensorArg log_probs { log_probs_t, "log_probs", 1 }; - TensorArg targets { targets_t, "targets", 2 }; - checkDim(c, log_probs, 3); - checkScalarType(c, log_probs, kFloat); - checkDim(c, targets, 1); - checkScalarType(c, targets, kInt); - checkContiguous(c, targets); // ? - checkBackend(c, {*log_probs}, Backend::CUDA); - checkBackend(c, {*targets}, Backend::CPU); - int64_t batch_size = log_probs->size(1); - AT_CHECK(input_lengths_.size() == batch_size, "input_lengths needs to have size to match batch_size"); - AT_CHECK(target_lengths_.size() == batch_size, "target_lengths needs to have size to match batch_size"); - - std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); - std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); - - setCuDNNStreamToCurrent(); - AT_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); - // checked in dispatch: - // assert other conditions for cudnnCTCLoss: all label lengths <= 256 - // all input lengths = logprob.size(0) - - auto handle = getCudnnHandle(); - - cudnnCTCLossAlgo_t algo = (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); - - Tensor probs = log_probs->softmax(2); - TensorDescriptor probs_desc{probs}; - Tensor grad = at::empty_like(probs); - TensorDescriptor grad_desc{grad}; - - CTCLossDescriptor ctc_loss_desc; - ctc_loss_desc.set(CUDNN_DATA_FLOAT); - - size_t workspace_size; - AT_CUDNN_CHECK(cudnnGetCTCLossWorkspaceSize(handle, probs_desc.desc(), grad_desc.desc(), - targets->data(), target_lengths.data(), input_lengths.data(), - algo, ctc_loss_desc.desc(), &workspace_size)); - - - Tensor workspace = log_probs->type().toScalarType(kByte).tensor(workspace_size); // new way of doing this with empty? 
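// [editor's note] A hedged sketch of the "new way of doing this with empty?" that the
// comment above asks about: allocate the byte workspace through at::empty and
// TensorOptions instead of the deprecated type().tensor() path. The helper name and
// free-function form are ours; at::empty, options(), and dtype() are the same calls
// used elsewhere in this diff.
#include <ATen/ATen.h>

static at::Tensor byte_workspace_like(const at::Tensor& ref, size_t workspace_size) {
  // Same device/backend as `ref`, one byte per element, sized by the cuDNN query.
  return at::empty({static_cast<int64_t>(workspace_size)},
                   ref.options().dtype(at::kByte));
}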
- Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); - - AT_CUDNN_CHECK(cudnnCTCLoss(handle, probs_desc.desc(), probs.data_ptr(), - targets->data(), target_lengths.data(), input_lengths.data(), - costs.data_ptr(), grad_desc.desc(), grad.data_ptr(), algo, - ctc_loss_desc.desc(), workspace.data_ptr(), workspace_size)); - - return std::make_tuple(costs, grad); -} - - -}} // namespace at::native - -#endif diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 08e84618e81db3..63f0d7a29578f9 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -166,7 +166,7 @@ namespace { std::vector descriptors(batch_sizes.size()); size_t i = 0; // To be mutated in the loop - auto batch_tensor_size = tensor.sizes().vec(); + std::vector batch_tensor_size(tensor.sizes()); for (auto batch_size : batch_sizes) { batch_tensor_size[0] = batch_size; // NB: cuDNN RNN API does not support 2d descriptors, so we @@ -994,7 +994,7 @@ std::tuple> _cudnn_rnn_backward( if (output_mask[3]) { dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); } - return std::tuple>{dx, dhx, dcx, dw}; + return std::tuple{dx, dhx, dcx, dw}; } // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2a8941675d6c9f..8692d6165ff72a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -29,11 +29,6 @@ - func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor variants: function, method -- func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) -> (Tensor, Tensor) - variants: function - dispatch: - CUDA: _cudnn_ctc_loss - - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function dispatch: @@ -249,9 +244,6 @@ - func: blackman_window(int64_t window_length, bool periodic, TensorOptions options={}) -> Tensor variants: function -- func: broadcast_tensors(TensorList tensors) -> TensorList - variants: function - - func: cat(TensorList tensors, int64_t dim=0) -> Tensor variants: function @@ -512,21 +504,6 @@ - func: cumprod_out(Tensor result, Tensor self, int64_t dim) -> Tensor variants: function -- func: ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor - variants: function - -- func: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank=0) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: ctc_loss_cpu - CUDA: ctc_loss_gpu - -- func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int64_t blank) -> Tensor - variants: function - dispatch: - CPU: ctc_loss_backward_cpu - CUDA: ctc_loss_backward_gpu - - func: det(Tensor self) -> Tensor - func: diagflat(Tensor self, int64_t offset=0) -> Tensor @@ -738,45 +715,9 @@ variants: function deprecated: true -# NOTE [ grid_sampler Native Functions ] -# 
`grid_sampler` does all the shape checking and then dispatches to one of -# `cudnn_grid_sampler`, `grid_sampler_2d`, or `grid_sampler_3d`, each of which -# has the corresponding backward defined as native functions as well. Therefore, -# in these functions and their backwards, no more shape checking is done. -# -# Additionally, arguments `padding_mode` and `interpolation_mode` are cast to -# enums defined in `native/GridSampler.h`. `cudnn_grid_sampler` doesn't take in -# `interpolation_mode` because it only supports Bilinear interpolation mode. -# -# ssnl: Currently `interpolation_mode` is just a placeholder. It is not really -# used. Everywhere Bilinear is assumed. I will add Nearest soon. - func: grid_sampler(Tensor input, Tensor grid, int64_t padding_mode) -> Tensor variants: function -- func: grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function - dispatch: - CPU: grid_sampler_2d_cpu - CUDA: grid_sampler_2d_cuda - -- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: grid_sampler_2d_backward_cpu - CUDA: grid_sampler_2d_backward_cuda - -- func: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> Tensor - variants: function - dispatch: - CPU: grid_sampler_3d_cpu - CUDA: grid_sampler_3d_cuda - -- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) -> (Tensor, Tensor) - variants: function - dispatch: - CPU: grid_sampler_3d_backward_cpu - CUDA: grid_sampler_3d_backward_cuda - - func: hann_window(int64_t window_length, TensorOptions options={}) -> Tensor variants: function @@ -1329,12 +1270,6 @@ - func: selu_(Tensor self) -> Tensor variants: function -- func: celu(Tensor self, Scalar alpha=1.0) -> Tensor - variants: function - -- func: celu_(Tensor self, Scalar alpha=1.0) -> Tensor - variants: function - - func: sigmoid(Tensor self) -> Tensor - func: sigmoid_(Tensor self) -> Tensor diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 7a7e8be5c7ff6a..0cac9bcb9131fa 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -63,7 +63,7 @@ SparseTensor new_sparse(const SparseType& dtype) { AT_ASSERT(!dtype.is_variable()); AT_ASSERT(dtype.is_sparse()); // TODO: Hmm... this const_cast business seems a bit dodgy - return SparseTensor(new SparseTensorImpl(dtype.backend(), dtype.scalarType()), /* retain */ false); + return SparseTensor(new SparseTensorImpl(const_cast(&dtype)), /* retain */ false); } /*** Helper methods ***/ diff --git a/aten/src/ATen/native/sparse/SparseUtils.h b/aten/src/ATen/native/sparse/SparseUtils.h index aac948e4940241..226b9084579031 100644 --- a/aten/src/ATen/native/sparse/SparseUtils.h +++ b/aten/src/ATen/native/sparse/SparseUtils.h @@ -118,7 +118,7 @@ inline Tensor _new_values_with_size_of(const Tensor& values, int64_t nnz) { // That's the assumption this code makes. 
return values.type().tensor({nnz}); } else { - std::vector size = values.sizes().vec(); + std::vector size = values.sizes(); size[0] = nnz; return values.type().tensor(size); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index ff4b0e0c57736c..02b190e4901c55 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -81,7 +81,7 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { int64_t newNnz = newEnd.first - indicesIter; indices1D.resize_({1, newNnz}); - auto newValues_size = values.sizes().vec(); + std::vector newValues_size(values.sizes()); newValues_size[0] = newNnz; Tensor newValues = at::empty(newValues_size, values.options()); diff --git a/aten/src/ATen/nn.yaml b/aten/src/ATen/nn.yaml index 8a8a8a5dbe954b..86783e4f76dcd6 100644 --- a/aten/src/ATen/nn.yaml +++ b/aten/src/ATen/nn.yaml @@ -58,7 +58,7 @@ # Activation functions -- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) +- name: elu(Tensor self, Scalar alpha=1, Scalar scale=1) cname: ELU has_inplace: True scalar_check: @@ -274,3 +274,11 @@ - name: thnn_conv_dilated3d(Tensor self, Tensor weight, IntList[3] kernel_size, Tensor bias={}, IntList[3] stride=1, IntList[3] padding=0, IntList[3] dilation=1) cname: VolumetricDilatedConvolution buffers: [columns, ones] + +# Vision + +- name: thnn_grid_sampler_bilinear2d(Tensor self, Tensor grid, int64_t padding_mode) + cname: SpatialGridSamplerBilinear + +- name: thnn_grid_sampler_bilinear3d(Tensor self, Tensor grid, int64_t padding_mode) + cname: VolumetricGridSamplerBilinear diff --git a/aten/src/ATen/optional.h b/aten/src/ATen/optional.h index 0a395bae67cda6..287ddd8577b340 100644 --- a/aten/src/ATen/optional.h +++ b/aten/src/ATen/optional.h @@ -1 +1,982 @@ -#include +// Copyright (C) 2011 - 2012 Andrzej Krzemienski. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// The idea and interface is based on Boost.Optional library +// authored by Fernando Luis Cacciola Carballal +// +// From https://github.com/akrzemi1/Optional +// +// ATen: +// - Move to `at` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle it. + +#pragma once + +# include +# include +# include +# include +# include +# include +# include + +# define TR2_OPTIONAL_REQUIRES(...) 
typename std::enable_if<__VA_ARGS__::value, bool>::type = false + +# if defined __GNUC__ // NOTE: GNUC is also defined for Clang +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___ +# endif +# +# if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# elif (__GNUC__ > 4) +# define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# endif +# endif +# +# if defined __clang_major__ +# if (__clang_major__ == 3 && __clang_minor__ >= 5) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# elif (__clang_major__ > 3) +# define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# endif +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2) +# define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_ +# endif +# endif +# +# if defined _MSC_VER +# if (_MSC_VER >= 1900) +# define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# endif +# endif + +# if defined __clang__ +# if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9) +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif +# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___ +# define OPTIONAL_HAS_THIS_RVALUE_REFS 1 +# else +# define OPTIONAL_HAS_THIS_RVALUE_REFS 0 +# endif + + +# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___ +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1 +# define OPTIONAL_CONSTEXPR_INIT_LIST constexpr +# else +# define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0 +# define OPTIONAL_CONSTEXPR_INIT_LIST +# endif + +# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L) +# define OPTIONAL_HAS_MOVE_ACCESSORS 1 +# else +# define OPTIONAL_HAS_MOVE_ACCESSORS 0 +# endif + +# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr +# if (defined __cplusplus) && (__cplusplus == 201103L) +# define OPTIONAL_MUTABLE_CONSTEXPR +# else +# define OPTIONAL_MUTABLE_CONSTEXPR constexpr +# endif + +namespace at { + +// 20.5.4, optional for object types +template class optional; + +// 20.5.5, optional for lvalue reference types +template class optional; + + +// workaround: std utility functions aren't constexpr yet +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type& t) noexcept +{ + return static_cast(t); +} + +template inline constexpr T&& constexpr_forward(typename std::remove_reference::type&& t) noexcept +{ + static_assert(!std::is_lvalue_reference::value, "!!"); + return static_cast(t); +} + +template inline constexpr typename std::remove_reference::type&& constexpr_move(T&& t) noexcept +{ + return static_cast::type&&>(t); +} + + +#if defined NDEBUG +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR) +#else +# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? 
(EXPR) : ([]{assert(!#CHECK);}(), (EXPR))) +#endif + + +namespace detail_ +{ + +// static_addressof: a constexpr version of addressof +template +struct has_overloaded_addressof +{ + template + constexpr static bool has_overload(...) { return false; } + + template ().operator&()) > + constexpr static bool has_overload(bool) { return true; } + + constexpr static bool value = has_overload(true); +}; + +template )> +constexpr T* static_addressof(T& ref) +{ + return &ref; +} + +template )> +T* static_addressof(T& ref) +{ + return std::addressof(ref); +} + + +// the call to convert(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { return v; } + +} // namespace detail + + +constexpr struct trivial_init_t{} trivial_init{}; + + +// 20.5.6, In-place construction +constexpr struct in_place_t{} in_place{}; + + +// 20.5.7, Disengaged state indicator +struct nullopt_t +{ + struct init{}; + constexpr explicit nullopt_t(init){} +}; +constexpr nullopt_t nullopt{nullopt_t::init()}; + + +// 20.5.8, class bad_optional_access +class bad_optional_access : public std::logic_error { +public: + explicit bad_optional_access(const std::string& what_arg) : logic_error{what_arg} {} + explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {} +}; + + +template +union storage_t +{ + unsigned char dummy_; + T value_; + + constexpr storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~storage_t(){} +}; + + +template +union constexpr_storage_t +{ + unsigned char dummy_; + T value_; + + constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {}; + + template + constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward(args)...) {} + + ~constexpr_storage_t() = default; +}; + + +template +struct optional_base +{ + bool init_; + storage_t storage_; + + constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + explicit optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) {} + + ~optional_base() { if (init_) storage_.value_.T::~T(); } +}; + + +template +struct constexpr_optional_base +{ + bool init_; + constexpr_storage_t storage_; + + constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {}; + + explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {} + + explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {} + + template explicit constexpr constexpr_optional_base(in_place_t, Args&&... args) + : init_(true), storage_(constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list il, Args&&... args) + : init_(true), storage_(il, std::forward(args)...) 
{} + + ~constexpr_optional_base() = default; +}; + +template +using OptionalBase = typename std::conditional< + std::is_trivially_destructible::value, // if possible + constexpr_optional_base::type>, // use base with trivial destructor + optional_base::type> +>::type; + + + +template +class optional : private OptionalBase +{ + static_assert( !std::is_same::type, nullopt_t>::value, "bad T" ); + static_assert( !std::is_same::type, in_place_t>::value, "bad T" ); + + + constexpr bool initialized() const noexcept { return OptionalBase::init_; } + typename std::remove_const::type* dataptr() { return std::addressof(OptionalBase::storage_.value_); } + constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase::storage_.value_); } + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + constexpr const T& contained_val() const& { return OptionalBase::storage_.value_; } +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } + OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase::storage_.value_; } +# else + T& contained_val() & { return OptionalBase::storage_.value_; } + T&& contained_val() && { return std::move(OptionalBase::storage_.value_); } +# endif +# else + constexpr const T& contained_val() const { return OptionalBase::storage_.value_; } + T& contained_val() { return OptionalBase::storage_.value_; } +# endif + + void clear() noexcept { + if (initialized()) dataptr()->T::~T(); + OptionalBase::init_ = false; + } + + template + void initialize(Args&&... args) noexcept(noexcept(T(std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(std::forward(args)...); + OptionalBase::init_ = true; + } + + template + void initialize(std::initializer_list il, Args&&... args) noexcept(noexcept(T(il, std::forward(args)...))) + { + assert(!OptionalBase::init_); + ::new (static_cast(dataptr())) T(il, std::forward(args)...); + OptionalBase::init_ = true; + } + +public: + typedef T value_type; + + // 20.5.5.1, constructors + constexpr optional() noexcept : OptionalBase() {}; + constexpr optional(nullopt_t) noexcept : OptionalBase() {}; + + optional(const optional& rhs) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(*rhs); + OptionalBase::init_ = true; + } + } + + optional(optional&& rhs) noexcept(std::is_nothrow_move_constructible::value) + : OptionalBase() + { + if (rhs.initialized()) { + ::new (static_cast(dataptr())) T(std::move(*rhs)); + OptionalBase::init_ = true; + } + } + + constexpr optional(const T& v) : OptionalBase(v) {} + + constexpr optional(T&& v) : OptionalBase(constexpr_move(v)) {} + + template + explicit constexpr optional(in_place_t, Args&&... args) + : OptionalBase(in_place_t{}, constexpr_forward(args)...) {} + + template >)> + OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list il, Args&&... args) + : OptionalBase(in_place_t{}, il, constexpr_forward(args)...) 
{} + + // 20.5.4.2, Destructor + ~optional() = default; + + // 20.5.4.3, assignment + optional& operator=(nullopt_t) noexcept + { + clear(); + return *this; + } + + optional& operator=(const optional& rhs) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(*rhs); + else if (initialized() == true && rhs.initialized() == true) contained_val() = *rhs; + return *this; + } + + optional& operator=(optional&& rhs) + noexcept(std::is_nothrow_move_assignable::value && std::is_nothrow_move_constructible::value) + { + if (initialized() == true && rhs.initialized() == false) clear(); + else if (initialized() == false && rhs.initialized() == true) initialize(std::move(*rhs)); + else if (initialized() == true && rhs.initialized() == true) contained_val() = std::move(*rhs); + return *this; + } + + template + auto operator=(U&& v) + -> typename std::enable_if + < + std::is_same::type, T>::value, + optional& + >::type + { + if (initialized()) { contained_val() = std::forward(v); } + else { initialize(std::forward(v)); } + return *this; + } + + + template + void emplace(Args&&... args) + { + clear(); + initialize(std::forward(args)...); + } + + template + void emplace(std::initializer_list il, Args&&... args) + { + clear(); + initialize(il, std::forward(args)...); + } + + // 20.5.4.4, Swap + void swap(optional& rhs) noexcept(std::is_nothrow_move_constructible::value && noexcept(swap(std::declval(), std::declval()))) + { + if (initialized() == true && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); } + else if (initialized() == false && rhs.initialized() == true) { initialize(std::move(*rhs)); rhs.clear(); } + else if (initialized() == true && rhs.initialized() == true) { using std::swap; swap(**this, *rhs); } + } + + // 20.5.4.5, Observers + + explicit constexpr operator bool() const noexcept { return initialized(); } + constexpr bool has_value() const noexcept { return initialized(); } + + constexpr T const* operator ->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr()); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const& { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & { + assert (initialized()); + return contained_val(); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && { + assert (initialized()); + return constexpr_move(contained_val()); + } + + constexpr T const& value() const& { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T& value() & { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + OPTIONAL_MUTABLE_CONSTEXPR T&& value() && { + if (!initialized()) throw bad_optional_access("bad optional access"); + return std::move(contained_val()); + } + +# else + + T* operator ->() { + assert (initialized()); + return dataptr(); + } + + constexpr T const& operator *() const { + return contained_val(); + } + + T& operator *() { + assert (initialized()); + return contained_val(); + } + + constexpr T const& value() const { + return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + + T& value() { + return initialized() ? 
contained_val() : (throw bad_optional_access("bad optional access"), contained_val()); + } + +# endif + +# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1 + + template + constexpr T value_or(V&& v) const& + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# if OPTIONAL_HAS_MOVE_ACCESSORS == 1 + + template + OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# else + + template + T value_or(V&& v) && + { + return *this ? constexpr_move(const_cast&>(*this).contained_val()) : detail_::convert(constexpr_forward(v)); + } + +# endif + +# else + + template + constexpr T value_or(V&& v) const + { + return *this ? **this : detail_::convert(constexpr_forward(v)); + } + +# endif + + // 20.6.3.6, modifiers + void reset() noexcept { clear(); } +}; + + +template +class optional +{ + static_assert( !std::is_same::value, "bad T" ); + static_assert( !std::is_same::value, "bad T" ); + T* ref; + +public: + + // 20.5.5.1, construction/destruction + constexpr optional() noexcept : ref(nullptr) {} + + constexpr optional(nullopt_t) noexcept : ref(nullptr) {} + + constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {} + + optional(T&&) = delete; + + constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {} + + explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {} + + explicit optional(in_place_t, T&&) = delete; + + ~optional() = default; + + // 20.5.5.2, mutation + optional& operator=(nullopt_t) noexcept { + ref = nullptr; + return *this; + } + + // optional& operator=(const optional& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + // optional& operator=(optional&& rhs) noexcept { + // ref = rhs.ref; + // return *this; + // } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + std::is_same::type, optional>::value, + optional& + >::type + { + ref = rhs.ref; + return *this; + } + + template + auto operator=(U&& rhs) noexcept + -> typename std::enable_if + < + !std::is_same::type, optional>::value, + optional& + >::type + = delete; + + void emplace(T& v) noexcept { + ref = detail_::static_addressof(v); + } + + void emplace(T&&) = delete; + + + void swap(optional& rhs) noexcept + { + std::swap(ref, rhs.ref); + } + + // 20.5.5.3, observers + constexpr T* operator->() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref); + } + + constexpr T& operator*() const { + return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref); + } + + constexpr T& value() const { + return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref); + } + + explicit constexpr operator bool() const noexcept { + return ref != nullptr; + } + + constexpr bool has_value() const noexcept { + return ref != nullptr; + } + + template + constexpr typename std::decay::type value_or(V&& v) const + { + return *this ? **this : detail_::convert::type>(constexpr_forward(v)); + } + + // x.x.x.x, modifiers + void reset() noexcept { ref = nullptr; } +}; + + +template +class optional +{ + static_assert( sizeof(T) == 0, "optional rvalue references disallowed" ); +}; + + +// 20.5.8, Relational operators +template constexpr bool operator==(const optional& x, const optional& y) +{ + return bool(x) != bool(y) ? false : bool(x) == false ? 
true : *x == *y; +} + +template constexpr bool operator!=(const optional& x, const optional& y) +{ + return !(x == y); +} + +template constexpr bool operator<(const optional& x, const optional& y) +{ + return (!y) ? false : (!x) ? true : *x < *y; +} + +template constexpr bool operator>(const optional& x, const optional& y) +{ + return (y < x); +} + +template constexpr bool operator<=(const optional& x, const optional& y) +{ + return !(y < x); +} + +template constexpr bool operator>=(const optional& x, const optional& y) +{ + return !(x < y); +} + + +// 20.5.9, Comparison with nullopt +template constexpr bool operator==(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator==(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + +template constexpr bool operator!=(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator!=(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<(const optional&, nullopt_t) noexcept +{ + return false; +} + +template constexpr bool operator<(nullopt_t, const optional& x) noexcept +{ + return bool(x); +} + +template constexpr bool operator<=(const optional& x, nullopt_t) noexcept +{ + return (!x); +} + +template constexpr bool operator<=(nullopt_t, const optional&) noexcept +{ + return true; +} + +template constexpr bool operator>(const optional& x, nullopt_t) noexcept +{ + return bool(x); +} + +template constexpr bool operator>(nullopt_t, const optional&) noexcept +{ + return false; +} + +template constexpr bool operator>=(const optional&, nullopt_t) noexcept +{ + return true; +} + +template constexpr bool operator>=(nullopt_t, const optional& x) noexcept +{ + return (!x); +} + + + +// 20.5.10, Comparison with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? 
*x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + +// Comparison of optional with T +template constexpr bool operator==(const optional& x, const T& v) +{ + return bool(x) ? *x == v : false; +} + +template constexpr bool operator==(const T& v, const optional& x) +{ + return bool(x) ? v == *x : false; +} + +template constexpr bool operator!=(const optional& x, const T& v) +{ + return bool(x) ? *x != v : true; +} + +template constexpr bool operator!=(const T& v, const optional& x) +{ + return bool(x) ? v != *x : true; +} + +template constexpr bool operator<(const optional& x, const T& v) +{ + return bool(x) ? *x < v : true; +} + +template constexpr bool operator>(const T& v, const optional& x) +{ + return bool(x) ? v > *x : true; +} + +template constexpr bool operator>(const optional& x, const T& v) +{ + return bool(x) ? *x > v : false; +} + +template constexpr bool operator<(const T& v, const optional& x) +{ + return bool(x) ? v < *x : false; +} + +template constexpr bool operator>=(const optional& x, const T& v) +{ + return bool(x) ? *x >= v : false; +} + +template constexpr bool operator<=(const T& v, const optional& x) +{ + return bool(x) ? v <= *x : false; +} + +template constexpr bool operator<=(const optional& x, const T& v) +{ + return bool(x) ? *x <= v : true; +} + +template constexpr bool operator>=(const T& v, const optional& x) +{ + return bool(x) ? v >= *x : true; +} + + +// 20.5.12, Specialized algorithms +template +void swap(optional& x, optional& y) noexcept(noexcept(x.swap(y))) +{ + x.swap(y); +} + + +template +constexpr optional::type> make_optional(T&& v) +{ + return optional::type>(constexpr_forward(v)); +} + +template +constexpr optional make_optional(std::reference_wrapper v) +{ + return optional(v.get()); +} + + +} // namespace at + +namespace std +{ + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? std::hash{}(*arg) : result_type{}; + } + }; + + template + struct hash> + { + typedef typename hash::result_type result_type; + typedef at::optional argument_type; + + constexpr result_type operator()(argument_type const& arg) const { + return arg ? 
std::hash{}(*arg) : result_type{}; + } + }; +} + +# undef TR2_OPTIONAL_REQUIRES +# undef TR2_OPTIONAL_ASSERTED_EXPRESSION diff --git a/aten/src/ATen/templates/StorageDerived.cpp b/aten/src/ATen/templates/StorageDerived.cpp new file mode 100644 index 00000000000000..0491203c3286e6 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.cpp @@ -0,0 +1,69 @@ +#include "ATen/${Storage}.h" + +// ${generated_comment} + +#include "ATen/Half.h" +#include "ATen/Allocator.h" +#include + +#include "ATen/Config.h" +$extra_cuda_headers + +namespace at { + +${Storage}::${Storage}() + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + 0, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator, +#else + getTHDefaultAllocator(), +#endif + /* resizable */ true)) {} + +${Storage}::${Storage}(size_t size) + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + size, +#if ${isCUDA} + globalContext().getTHCState()->cudaDeviceAllocator, +#else + getTHDefaultAllocator(), +#endif + /* resizable */ true)) {} + +${Storage}::${Storage}(size_t size, Allocator* allocator) + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + size, + allocator, + /* resizable */ false)) {} + +// TODO: Take in Device as an input to the std::function constructor + +#if ${isCUDA} +static int getPointerDevice(void* ptr) { + struct cudaPointerAttributes attr; + THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); + return attr.device; +} +#endif + +${Storage}::${Storage}( + void * data, + size_t size, + const std::function & deleter) + : Storage(new StorageImpl( + ScalarType::${ScalarName}, + size, + InefficientStdFunctionContext::makeDataPtr(data, deleter, +#if ${isCUDA} + Device(kCUDA, getPointerDevice(data)) +#else + kCPU +#endif + ), + /* allocator */ nullptr, + /* resizable */ false)) {} +} diff --git a/aten/src/ATen/templates/StorageDerived.h b/aten/src/ATen/templates/StorageDerived.h new file mode 100644 index 00000000000000..dddcd5dbf03f21 --- /dev/null +++ b/aten/src/ATen/templates/StorageDerived.h @@ -0,0 +1,31 @@ +#pragma once + +// ${generated_comment} + +$th_headers + +#include "ATen/Storage.h" +#include "ATen/Context.h" + +#include + +namespace at { + +struct Allocator; + +struct ${Storage} final : public Storage { + ${Storage}(); + ${Storage}(StorageImpl* storage_impl) : Storage(storage_impl){}; + ${Storage}(size_t size); + ${Storage}(size_t size, Allocator* allocator); + ${Storage}( + void* data, + size_t size, + const std::function& deleter); + StorageImpl* storage_impl_; + + protected: + friend struct ${Type}; +}; + +} // namespace at diff --git a/aten/src/ATen/templates/Tensor.h b/aten/src/ATen/templates/Tensor.h index 55fb4aec0cbb60..31e952ebb79ff8 100644 --- a/aten/src/ATen/templates/Tensor.h +++ b/aten/src/ATen/templates/Tensor.h @@ -2,6 +2,7 @@ // ${generated_comment} +#include "ATen/Generator.h" #include "ATen/Scalar.h" #include "ATen/ScalarType.h" #include "ATen/SparseTensorRef.h" @@ -9,12 +10,12 @@ #include "ATen/TensorAccessor.h" #include "ATen/TensorBase.h" #include "ATen/TensorImpl.h" +#include "ATen/Utils.h" #include "ATen/Device.h" #include "ATen/Layout.h" #include "ATen/optional.h" namespace at { -struct Generator; struct Type; struct Tensor; struct TensorOptions; diff --git a/aten/src/ATen/templates/TensorDense.cpp b/aten/src/ATen/templates/TensorDense.cpp index aeba9fb22a3653..cc2f47a89180ab 100644 --- a/aten/src/ATen/templates/TensorDense.cpp +++ b/aten/src/ATen/templates/TensorDense.cpp @@ -3,5 +3,5 @@ std::unique_ptr ${Tensor}::storage() { auto storage = 
THTensor_getStoragePtr(tensor); THStorage_retain(storage); - return std::unique_ptr(new Storage(storage)); + return std::unique_ptr(new ${Storage}(storage)); } diff --git a/aten/src/ATen/templates/TensorDerived.cpp b/aten/src/ATen/templates/TensorDerived.cpp index 5fab8bf2226417..d72ba4abde2c12 100644 --- a/aten/src/ATen/templates/TensorDerived.cpp +++ b/aten/src/ATen/templates/TensorDerived.cpp @@ -5,8 +5,9 @@ // ${generated_comment} +#include "ATen/Config.h" #include "ATen/${Tensor}.h" -#include "ATen/Storage.h" +#include "ATen/${Storage}.h" #include "ATen/Scalar.h" #include "ATen/Half.h" @@ -21,7 +22,7 @@ namespace detail { } ${Tensor}::${Tensor}(${THTensor} * tensor) -: TensorImpl(Backend::${Backend}, ScalarType::${ScalarName}, tensor, /* is variable */ false) +: TensorImpl(&globalContext().getType(Backend::${Backend},ScalarType::${ScalarName}), tensor) {} ${TensorDenseOrSparse} diff --git a/aten/src/ATen/templates/TypeDerived.cpp b/aten/src/ATen/templates/TypeDerived.cpp index ddd1483f0436f3..67009473dddefc 100644 --- a/aten/src/ATen/templates/TypeDerived.cpp +++ b/aten/src/ATen/templates/TypeDerived.cpp @@ -31,14 +31,6 @@ namespace at { -#if ${isCUDA} -static int getPointerDevice(void* ptr) { - struct cudaPointerAttributes attr; - THCudaCheck(cudaPointerGetAttributes(&attr, ptr)); - return attr.device; -} -#endif - ${Type}::${Type}(Context* context) : Type(context, /*is_variable=*/false, /*is_undefined=*/false) {} ScalarType ${Type}::scalarType() const { @@ -52,44 +44,18 @@ bool ${Type}::is_sparse() const { return backend() == kSparseCPU || backend() == bool ${Type}::is_distributed() const { return false; } std::unique_ptr ${Type}::storage() const { - return std::unique_ptr(new Storage( - ScalarType::${ScalarName}, - 0, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator -#else - getTHDefaultAllocator() -#endif - )); + return std::unique_ptr(new ${Storage}()); } std::unique_ptr ${Type}::storage(size_t size) const { - return std::unique_ptr(new Storage( - ScalarType::${ScalarName}, - size, -#if ${isCUDA} - globalContext().getTHCState()->cudaDeviceAllocator -#else - getTHDefaultAllocator() -#endif - )); + return std::unique_ptr(new ${Storage}(size)); } std::unique_ptr ${Type}::storageFromBlob(void * data, int64_t size, const std::function & deleter) const { return std::unique_ptr( - new Storage( - ScalarType::${ScalarName}, - InefficientStdFunctionContext::makeDataPtr(data, deleter, -#if ${isCUDA} - Device(kCUDA, getPointerDevice(data)) -#else - kCPU -#endif - ), - size, - deleter)); + new ${Storage}(data,size,deleter)); } std::unique_ptr ${Type}::storageWithAllocator(int64_t size, Allocator* allocator) const { return std::unique_ptr( - new Storage(ScalarType::${ScalarName}, size, allocator)); + new ${Storage}(size, allocator)); } Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { if (retain) @@ -99,7 +65,7 @@ Tensor ${Type}::unsafeTensorFromTH(void * th_pointer, bool retain) const { std::unique_ptr ${Type}::unsafeStorageFromTH(void * th_pointer, bool retain) const { if (retain) ${THStorage}_retain(${state,} (${THStorage}*) th_pointer); - return std::unique_ptr(new Storage((${THStorage}*) th_pointer)); + return std::unique_ptr(new ${Storage}((${THStorage}*) th_pointer)); } std::unique_ptr ${Type}::generator() const { return std::unique_ptr(new ${Generator}(context)); diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 8e58df97073086..6b46c8c0b70018 100644 --- a/aten/src/ATen/test/basic.cpp +++ 
b/aten/src/ATen/test/basic.cpp @@ -270,10 +270,6 @@ static void test(Type & type) { auto result = tensor.m(relu).m(mse_loss, other, Reduction::ElementwiseMean); REQUIRE(result.allclose(mse_loss(relu(tensor), other))); } - SECTION("core") { - int i = CoreTest(); - REQUIRE(i + 1 == CoreTest()); - } } TEST_CASE( "basic tests CPU", "[cpu]" ) { diff --git a/aten/src/ATen/test/scalar_tensor_test.cpp b/aten/src/ATen/test/scalar_tensor_test.cpp index 4a400e3a517ee6..64098c5bf76c56 100644 --- a/aten/src/ATen/test/scalar_tensor_test.cpp +++ b/aten/src/ATen/test/scalar_tensor_test.cpp @@ -65,13 +65,30 @@ void test(Type &T) { require_equal_size_dim(t2, ones({0}, T)); // unsqueeze +#ifndef USE_TH_SIZE_ZERO_DIM + if (t.numel() != 0) { + REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t.unsqueeze(0)); + } +#else REQUIRE(t.unsqueeze(0).dim() == t.dim() + 1); +#endif // unsqueeze_ { auto t2 = ones(*s, T); +#ifndef USE_TH_SIZE_ZERO_DIM + if (t2.numel() != 0) { + auto r = t2.unsqueeze_(0); + REQUIRE(r.dim() == t.dim() + 1); + } else { + REQUIRE_THROWS(t2.unsqueeze_(0)); + } +#else auto r = t2.unsqueeze_(0); REQUIRE(r.dim() == t.dim() + 1); +#endif } // squeeze (with dimension argument) diff --git a/aten/src/TH/THHalf.cpp b/aten/src/TH/THHalf.cpp index 840c97617c4cb2..1c46c59a9977fa 100644 --- a/aten/src/TH/THHalf.cpp +++ b/aten/src/TH/THHalf.cpp @@ -1,5 +1,4 @@ #include "THHalf.h" -#include /* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ @@ -17,14 +16,85 @@ TH_API float TH_half2float(THHalf h) return f; } +// Host functions for converting between FP32 and FP16 formats void TH_halfbits2float(unsigned short* src, float* res) { - *res = at::detail::halfbits2float(*src); -} + unsigned h = *src; + unsigned sign = ((h >> 15) & 1); + unsigned exponent = ((h >> 10) & 0x1f); + unsigned mantissa = ((h & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); +} void TH_float2halfbits(float* src, unsigned short* dest) { - *dest = at::detail::float2halfbits(*src); + unsigned x = *(unsigned*)src; + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + *dest = 0x7fffU; + return ; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + *dest = sign | 0x7c00U; + return; + } + if (u < 0x33000001) { + *dest = (sign | 0x0000); + return; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. 
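// [editor's note] Context for the rounding step below: `lsb` is the value of the
// lowest mantissa bit kept after the shift, `lsb_s1` is half of that unit, and
// `lsb_m1` masks the bits that get dropped. "Round to nearest even" then reads:
// round up when the dropped bits exceed half a unit, and on an exact tie round up
// only if the kept mantissa is odd, so ties always land on an even mantissa. The
// increment can carry out of the 10-bit mantissa, which is why the exponent is
// bumped and the mantissa reset when (mantissa & 0x3ff) becomes zero.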
+ remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + *dest = (sign | (exponent << 10) | mantissa); } diff --git a/aten/src/TH/THStorageFunctions.cpp b/aten/src/TH/THStorageFunctions.cpp index 0c36d5bf97fcf0..0f05bb466651d3 100644 --- a/aten/src/TH/THStorageFunctions.cpp +++ b/aten/src/TH/THStorageFunctions.cpp @@ -19,25 +19,38 @@ void THStorage_free(THStorage* storage) { if (!storage) { return; } - storage->release(); + + if (--storage->refcount == 0) { + if (storage->finalizer) { + (*storage->finalizer)(); + } + storage->finalizer = nullptr; + storage->data_ptr.clear(); + THStorage_weakFree(storage); + } } // Manually retains a weak reference void THStorage_weakRetain(THStorage *weak_storage) { - weak_storage->weak_retain(); + weak_storage->weakcount++; } // Releases a weak reference void THStorage_weakFree(THStorage *weak_storage) { - weak_storage->weak_release(); + if (--weak_storage->weakcount == 0) { + delete weak_storage; + } } // Given a weak reference, returns a strong reference to a storage (which must // be freed when done) or null if the storage is already dead. THStorage* THStorage_weakLock(THStorage *weak_storage) { - if (weak_storage->weak_lock()) - return weak_storage; - return nullptr; + for (;;) { + int refcount = weak_storage->refcount.load(); + if (refcount == 0) return nullptr; + if (weak_storage->refcount.compare_exchange_strong(refcount, refcount + 1)) break; + } + return weak_storage; } THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { @@ -82,7 +95,7 @@ ptrdiff_t THStorage_size(const THStorage *self) void THStorage_retain(THStorage *storage) { if (storage) { - storage->retain(); + ++storage->refcount; } } diff --git a/aten/src/TH/THStorageFunctions.hpp b/aten/src/TH/THStorageFunctions.hpp index 0e8b3e4ab17bee..671e2f39fb1c7e 100644 --- a/aten/src/TH/THStorageFunctions.hpp +++ b/aten/src/TH/THStorageFunctions.hpp @@ -35,6 +35,8 @@ TH_API ptrdiff_t THStorage_size(const THStorage *self); +TH_API void THStorage_setFlag(THStorage *storage, const char flag); +TH_API void THStorage_clearFlag(THStorage *storage, const char flag); TH_API void THStorage_retain(THStorage *storage); TH_API void THStorage_resize(THStorage *storage, ptrdiff_t size); TH_API void THStorage_swap(THStorage *storage1, THStorage *storage2); diff --git a/aten/src/TH/THTensor.cpp b/aten/src/TH/THTensor.cpp index 5f3b6ed1fef6cc..13df5128e5f5f8 100644 --- a/aten/src/TH/THTensor.cpp +++ b/aten/src/TH/THTensor.cpp @@ -32,7 +32,7 @@ THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList // This could perhaps be combined with the below code, but the complexity didn't seem worth it. 
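// [editor's note] The THStorage_weakLock change above re-implements "lock a weak
// reference": bump the strong refcount with a compare-and-swap loop, but only while
// it is still non-zero. A standalone sketch of that pattern with std::atomic; the
// helper name is ours, not part of TH.
#include <atomic>

static bool try_strong_retain(std::atomic<int>& refcount) {
  int current = refcount.load();
  while (current != 0) {
    // On failure compare_exchange_weak reloads `current`, so the loop just retries.
    if (refcount.compare_exchange_weak(current, current + 1)) {
      return true;   // caller now owns a strong reference
    }
  }
  return false;      // storage already dead; the weak lock fails
}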
int64_t numel = std::accumulate(oldshape.begin(), oldshape.end(), 1, std::multiplies()); if (numel == 0 && oldshape.equals(newshape)) { - return oldstride.vec(); + return std::vector(oldstride); } std::vector newstride(newshape.size()); diff --git a/aten/src/TH/THTensor.hpp b/aten/src/TH/THTensor.hpp index 56204a00e9c3ed..16329f7ed7f621 100644 --- a/aten/src/TH/THTensor.hpp +++ b/aten/src/TH/THTensor.hpp @@ -56,10 +56,6 @@ struct THTensor return sizes_.size(); } - at::ScalarType scalar_type() const { - return storage_->scalar_type; - } - ptrdiff_t storage_offset() const { return storage_offset_; } @@ -113,17 +109,6 @@ inline int64_t* THTensor_getStridePtr(THTensor* tensor) { // NB: Non-retaining inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { - // Within PyTorch, the invariant is that storage_ is always - // initialized; we never have tensors that don't have any storage. - // However, for Caffe2, this is not true, because they have permitted - // tensors to be allocated without specifying what scalar type - // they should be, only to be filled when GetMutableData is called - // for the first time (providing the necessary type). It is an ERROR to - // invoke any PyTorch operations on such a half-constructed storage, - // and this check tests for that case. - AT_CHECK(tensor->storage_, "Cannot use PyTorch operations on a half-constructed " - "tensor. If this tensor came from Caffe2, please call GetMutableData on " - "it first; otherwise, this is a bug, please report it."); return tensor->storage_; } @@ -133,7 +118,6 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) { inline void THTensor_resizeDim(THTensor* tensor, int64_t ndim) { // NB: This is *truly* a resize; calling code (e.g., squeeze) // assumes that old values are preserved - tensor->is_zero_dim_ = bool(ndim == 0); tensor->sizes_.resize(ndim); tensor->strides_.resize(ndim); } @@ -157,9 +141,6 @@ inline void THTensor_setStorageOffset(THTensor* tensor, ptrdiff_t storage_offset // NB: Steals ownership of storage inline void THTensor_stealAndSetStoragePtr(THTensor* tensor, THStorage* storage) { - // Caffe2 might have tensors whose storages are null, but we - // don't allow it in PyTorch. - AT_ASSERT(storage); tensor->storage_ = storage; } @@ -196,19 +177,6 @@ inline int THTensor_nDimensionLegacyAll(const THTensor* tensor) { } } -inline int64_t THTensor_strideLegacyNoScalars(const THTensor *self, int dim) { - THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", - dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); - return THTensor_isZeroDim(self) ? 1 : self->stride(dim); -} - -inline int64_t THTensor_sizeLegacyNoScalars(const THTensor *self, int dim) -{ - THArgCheck((dim >= 0) && (dim < THTensor_nDimensionLegacyNoScalars(self)), 2, "dimension %d out of range of %dD tensor", - dim+TH_INDEX_BASE, THTensor_nDimensionLegacyNoScalars(self)); - return THTensor_isZeroDim(self) ? 
1 : self->size(dim); -} - TH_API void THTensor_free(THTensor *self); TH_CPP_API at::optional> THTensor_compute_stride(at::IntList oldshape, at::IntList oldstride, at::IntList newshape); diff --git a/aten/src/TH/THTensorDimApply.h b/aten/src/TH/THTensorDimApply.h index ff05ed8194979d..00c24dee51adb8 100644 --- a/aten/src/TH/THTensorDimApply.h +++ b/aten/src/TH/THTensorDimApply.h @@ -39,8 +39,8 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyNoScalars(TENSOR1)); \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ + THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->dim()); \ int same_dims = 1; \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ same_dims = 0; \ @@ -56,8 +56,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -76,14 +76,14 @@ { \ CODE \ \ - if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -98,7 +98,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -145,13 +145,13 @@ int TH_TENSOR_DIM_APPLY_hasFinished = THTensor_(numel)(TENSOR1) == 0; \ int TH_TENSOR_DIM_APPLY_i; \ \ - if( (DIMENSION < 0) || (DIMENSION >= THTensor_nDimensionLegacyNoScalars(TENSOR1)) ) \ + if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->dim()) ) \ THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, THTensor_nDimensionLegacyAll(TENSOR1)); \ if( TENSOR1->dim() != TENSOR2->dim() ) { \ AT_ERROR("inconsistent tensor size, expected ", #TENSOR1, " ", TENSOR1->sizes(), " and ", #TENSOR2, " ", TENSOR2->sizes(), " to have the same number of dimensions"); \ } \ TH_UNUSED int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ continue; \ @@ -163,8 +163,8 @@ if (TH_TENSOR_DIM_APPLY_hasFinished) { \ return; \ } \ - TH_TENSOR_DIM_APPLY_counter = 
(int64_t*)THAlloc(sizeof(int64_t)*(THTensor_nDimensionLegacyNoScalars(TENSOR1))); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < THTensor_nDimensionLegacyNoScalars(TENSOR1); TH_TENSOR_DIM_APPLY_i++) \ + TH_TENSOR_DIM_APPLY_counter = (int64_t*)THAlloc(sizeof(int64_t)*(TENSOR1->dim())); \ + for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ \ TENSOR1##_data = THTensor_getStoragePtr(TENSOR1)->data()+(TENSOR1)->storage_offset(); \ @@ -179,14 +179,14 @@ { \ CODE \ \ - if(THTensor_nDimensionLegacyNoScalars(TENSOR1) == 1) \ + if(TENSOR1->dim() == 1) \ break; \ \ for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->dim(); TH_TENSOR_DIM_APPLY_i++) \ { \ if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ @@ -200,7 +200,7 @@ \ if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size(TH_TENSOR_DIM_APPLY_i)) \ { \ - if(TH_TENSOR_DIM_APPLY_i == THTensor_nDimensionLegacyNoScalars(TENSOR1)-1) \ + if(TH_TENSOR_DIM_APPLY_i == TENSOR1->dim()-1) \ { \ TH_TENSOR_DIM_APPLY_hasFinished = 1; \ break; \ diff --git a/aten/src/TH/generic/THTensor.cpp b/aten/src/TH/generic/THTensor.cpp index e68c60a9455c4f..58a5d39366c294 100644 --- a/aten/src/TH/generic/THTensor.cpp +++ b/aten/src/TH/generic/THTensor.cpp @@ -373,7 +373,11 @@ void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, int64_t fir THArgCheck( (dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck( firstIndex >= 0, 3, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 4, "out of range"); +#else + THArgCheck( size > 0, 4, "out of range"); +#endif THArgCheck(firstIndex <= src->size(dimension) - size, 4, "out of range"); THTensor_(set)(self, src); @@ -392,8 +396,12 @@ void THTensor_(select)(THTensor *self, THTensor *src, int dimension, int64_t sli if(!src) src = self; +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a vector"); +#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 3, "out of range"); @@ -415,8 +423,8 @@ void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dim if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THTensor_(set)(self, src); @@ -438,7 +446,10 @@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz if(!src) src = self; - THArgCheck((dimension >= 0) && (dimension < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck((dimension >= 0) && (dimension < src->dim()), 2, "out of range"); THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); @@ -448,20 +459,18 
@@ void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, int64_t siz std::vector newStride(/* size */ self->dim()+1); newSize[self->dim()] = size; - newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { - auto self_size = THTensor_sizeLegacyNoScalars(self, d); - auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self_size - size) / step + 1; - newStride[d] = step*self_stride; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self_size; - newStride[d] = self_stride; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } @@ -538,6 +547,9 @@ void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 2, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 2, "cannot unsqueeze empty tensor"); +#endif THTensor_(set)(self, src); @@ -716,6 +728,15 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t for(d = 0; d < nDimension; d++) { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. + if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -769,14 +790,14 @@ void THTensor_(resizeNd)(THTensor *self, int nDimension, int64_t *size, int64_t void THTensor_(set1d)(THTensor *tensor, int64_t x0, real value) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); THStorage_(set)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THTensor_(get1d)(const THTensor *tensor, int64_t x0) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); + THArgCheck(THTensor_nDimensionLegacyAll(tensor) == 1, 1, "tensor must have one dimension"); THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); return THStorage_(get)(THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 03946724dcadc6..644fa541a8f9ae 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -149,8 +149,15 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens int64_t *index_data; real *tensor_data, *src_data; - THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be 1-dimensional"); - THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(index) <= 1, 3, "Index is supposed to be an empty tensor or a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyAll(src), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + THArgCheck(THTensor_nDimensionLegacyAll(src) > 
0, 2, "Source tensor is empty"); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be 1-dimensional"); + THArgCheck(dim < src->dim(), 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); + //THArgCheck(src->dim() > 0, 2, "Source tensor is empty"); +#endif numel = THLongTensor_nElement(index); @@ -181,7 +188,7 @@ void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTens } } - if (src->dim() <= 1) { + if (src->dim() == 1) { #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) for (i=0; idim() <= 1) + else if (src->dim() == 1) { for (i=0; idim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < src->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif + THArgCheck(numel == src->size(dim),4,"Number of indices should be equal to source:size(dim)"); index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -388,8 +400,13 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real v int64_t *index_data; numel = THLongTensor_nElement(index); - THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(index) == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < THTensor_nDimensionLegacyAll(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#else + THArgCheck(index->dim() == 1, 3, "Index is supposed to be a vector"); + THArgCheck(dim < tensor->dim(), 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); +#endif index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); @@ -442,11 +459,19 @@ void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor { int64_t elems_per_row, i, idx; +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(dim < THTensor_(nDimensionLegacyAll)(tensor), 2, "Index dimension is out of bounds"); + THArgCheck(THLongTensor_nDimensionLegacyAll(index) == THTensor_(nDimensionLegacyAll)(tensor), 3, + "Index tensor must have same dimensions as output tensor"); + THArgCheck(THTensor_(nDimensionLegacyAll)(src) == THTensor_(nDimensionLegacyAll)(tensor), 4, + "Input tensor must have same dimensions as output tensor"); +#else THArgCheck(dim < THTensor_(nDimensionLegacyNoScalars)(tensor), 2, "Index dimension is out of bounds"); THArgCheck(THLongTensor_nDimensionLegacyNoScalars(index) == THTensor_(nDimensionLegacyNoScalars)(tensor), 3, "Index tensor must have same dimensions as output tensor"); THArgCheck(THTensor_(nDimensionLegacyNoScalars)(src) == THTensor_(nDimensionLegacyNoScalars)(tensor), 4, "Input tensor must have same dimensions as output tensor"); +#endif elems_per_row = THLongTensor_size(index, dim); diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 24d9a7e8c4ea07..c521d1da750a43 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -805,11 +805,11 @@ void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, T void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) { - if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) + if( (mat->dim() != 2) || (vec->dim() != 1) ) THError("matrix and vector expected, got 
%dD, %dD", - mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); + mat->dim(), vec->dim()); - if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) { + if( mat->size(1) != vec->size(0) ) { THDescBuff bm = THTensor_(sizeDesc)(mat); THDescBuff bv = THTensor_(sizeDesc)(vec); THError("size mismatch, %s, %s", bm.str, bv.str); @@ -837,14 +837,14 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor { THBlas_(gemv)('n', mat->size(0), mat->size(1), alpha, THTensor_(data)(mat), mat->stride(1), - THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), + THTensor_(data)(vec), vec->stride(0), beta, THTensor_(data)(r_), r_->stride(0)); } else if(mat->stride(1) == 1 && LDA_COND(mat->size(1), mat->size(0), mat->stride(0))) { THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(mat), mat->stride(0), - THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), + THTensor_(data)(vec), vec->stride(0), beta, THTensor_(data)(r_), r_->stride(0)); } else @@ -853,7 +853,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor THBlas_(gemv)('t', mat->size(1), mat->size(0), alpha, THTensor_(data)(cmat), cmat->stride(0), - THTensor_(data)(vec), THTensor_strideLegacyNoScalars(vec, 0), + THTensor_(data)(vec), vec->stride(0), beta, THTensor_(data)(r_), r_->stride(0)); THTensor_(free)(cmat); @@ -861,7 +861,7 @@ void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // In gemv (x,0).mv(0) does not // handle beta, whereas gemm does for case where (x,0).mm(0,y). - if (THTensor_sizeLegacyNoScalars(vec, 0) == 0 && mat->size(0) != 0) { + if (vec->size(0) == 0 && mat->size(0) != 0) { if (beta == 0) { THTensor_(zero)(r_); } else if (beta != 1) { @@ -1058,19 +1058,14 @@ void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) { - if( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) + if( (vec1->dim() != 1) || (vec2->dim() != 1) ) THError("vector and vector expected, got %dD, %dD tensors", - THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); + vec1->dim(), vec2->dim()); if(t->dim() != 2) THError("expected matrix, got %dD tensor for t", t->dim()); - auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); - auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); - auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); - auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); - - if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { + if( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THDescBuff bt = THTensor_(sizeDesc)(t); THDescBuff bv1 = THTensor_(sizeDesc)(vec1); THDescBuff bv2 = THTensor_(sizeDesc)(vec2); @@ -1092,27 +1087,27 @@ void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor // n == 1 || lda >= max(1, m) #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1))) + if(r_->stride(0) == 1 && LDA_COND(vec1->size(0), vec2->size(0), r_->stride(1))) { - THBlas_(ger)(vec1_size, vec2_size, - alpha, THTensor_(data)(vec1), vec1_stride, - THTensor_(data)(vec2), vec2_stride, + THBlas_(ger)(vec1->size(0), vec2->size(0), + alpha, THTensor_(data)(vec1), vec1->stride(0), + THTensor_(data)(vec2), vec2->stride(0), THTensor_(data)(r_), r_->stride(1)); } - else 
if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1_size, r_->stride(0))) + else if(r_->stride(1) == 1 && LDA_COND(vec2->size(0), vec1->size(0), r_->stride(0))) { - THBlas_(ger)(vec2_size, vec1_size, - alpha, THTensor_(data)(vec2), vec2_stride, - THTensor_(data)(vec1), vec1_stride, + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), THTensor_(data)(r_), r_->stride(0)); } else { THTensor *cr = THTensor_(newClone)(r_); - THBlas_(ger)(vec2_size, vec1_size, - alpha, THTensor_(data)(vec2), vec2_stride, - THTensor_(data)(vec1), vec1_stride, + THBlas_(ger)(vec2->size(0), vec1->size(0), + alpha, THTensor_(data)(vec2), vec2->stride(0), + THTensor_(data)(vec1), vec1->stride(0), THTensor_(data)(cr), cr->stride(0)); THTensor_(freeCopyTo)(cr, r_); diff --git a/aten/src/TH/generic/THTensorMoreMath.cpp b/aten/src/TH/generic/THTensorMoreMath.cpp index fa8fb0558661ea..d06ec255644cce 100644 --- a/aten/src/TH/generic/THTensorMoreMath.cpp +++ b/aten/src/TH/generic/THTensorMoreMath.cpp @@ -557,6 +557,9 @@ void THTensor_(onesLike)(THTensor *r_, THTensor *input) void THTensor_(diag)(THTensor *r_, THTensor *t, int k) { +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!t->is_empty()) +#endif THArgCheck(THTensor_(nDimensionLegacyNoScalars)(t) == 1 || THTensor_(nDimensionLegacyNoScalars)(t) == 2, 1, "matrix or a vector expected"); if(THTensor_(nDimensionLegacyNoScalars)(t) == 1) @@ -1183,11 +1186,19 @@ void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, i void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int64_t k, int dim, int dir, int sorted) { +#ifndef USE_TH_SIZE_ZERO_DIM + int numDims = THTensor_(nDimensionLegacyAll)(t); +#else int numDims = THTensor_(nDimensionLegacyNoScalars)(t); +#endif THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); int64_t sliceSize = THTensor_(size)(t, dim); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); +#else THArgCheck(k >= 0 && k <= sliceSize, 2, "k not in range for dimension"); +#endif THTensor *tmpResults = THTensor_(new)(); THTensor_(resize1d)(tmpResults, sliceSize); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index a8fb33c11a5bd4..9df36f097ba6ee 100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -10,7 +10,7 @@ #include "THCTensorInfo.cuh" int THCTensor_nDimensionLegacyNoScalars(THCState *state, const THCTensor *self) { - return THTensor_nDimensionLegacyNoScalars(self); + return self->dim(); } int THCTensor_nDimensionLegacyAll(THCState *state, const THCTensor *self) { @@ -99,6 +99,15 @@ void THCTensor_resizeNd(THCState *state, THCTensor *self, int nDimension, int64_ for(d = 0; d < nDimension; d++) { +#ifndef USE_TH_SIZE_ZERO_DIM + // we can't support this unless we have arbitrary 0-sized dimensions, but some calls to this + // currently exist and expect a size [0] tensor to be returned. 
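// Editor's note: illustrative sketch (not the actual THTensor/THCTensor resizeNd
// code) of the size handling that the #ifndef USE_TH_SIZE_ZERO_DIM guard restores:
// a zero in the leading dimension collapses to the legacy 1-d "size [0]" tensor,
// and every other dimension must be strictly positive. The function name is
// hypothetical and the error message mirrors the AT_CHECK text in the hunk.
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<int64_t> legacy_clamp_sizes(std::vector<int64_t> sizes) {
  if (!sizes.empty() && sizes[0] == 0) {
    return {0};  // legacy empty tensor: exactly one dimension of size 0
  }
  for (int64_t s : sizes) {
    if (s <= 0) throw std::runtime_error("sizes must be non-negative");
  }
  return sizes;
}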
+ if (d == 0 && size[d] == 0) { + nDimension = 1; + } else { + AT_CHECK(size[d] > 0, "sizes must be non-negative"); + } +#endif if((self->dim() > d) && (size[d] != self->size(d))) { hascorrectsize = false; } @@ -225,6 +234,9 @@ void THCTensor_unsqueeze1d(THCState *state, THCTensor *self, THCTensor *src, int src = self; THArgCheck((dimension >= 0) && (dimension <= src->dim()), 3, "dimension out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 3, "cannot unsqueeze empty tensor"); +#endif THCTensor_set(state, self, src); diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index 940af6eb86ead4..e15ba5e5a2c666 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -28,21 +28,11 @@ int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim) return THCTensor_size(state, self, dim); } -int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) -{ - return THTensor_sizeLegacyNoScalars(self, dim); -} - int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim) { return THCTensor_stride(state, self, dim); } -int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim) -{ - return THTensor_strideLegacyNoScalars(self, dim); -} - THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self) { return THCTensor_newSizeOf(state, self); @@ -377,7 +367,11 @@ void THCTensor_(narrow)(THCState *state, THCTensor *self, THCTensor *src, int di THArgCheck( (dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck( firstIndex >= 0, 4, "out of range"); +#ifdef USE_TH_SIZE_ZERO_DIM THArgCheck( size >= 0, 5, "out of range"); +#else + THArgCheck( size > 0, 5, "out of range"); +#endif THArgCheck(firstIndex+size <= src->size(dimension), 5, "out of range"); THCTensor_(set)(state, self, src); @@ -396,8 +390,12 @@ void THCTensor_(select)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(THTensor_nDimensionLegacyAll(src) > 1, 1, "cannot select on a vector"); +#else #ifndef USE_TH_SCALAR THArgCheck(src->dim() > 1, 1, "cannot select on a vector"); +#endif #endif THArgCheck((dimension >= 0) && (dimension < src->dim()), 3, "out of range"); THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size(dimension)), 4, "out of range"); @@ -419,8 +417,8 @@ void THCTensor_(transpose)(THCState *state, THCTensor *self, THCTensor *src, int if(!src) src = self; - THArgCheck( (dimension1 >= 0) && (dimension1 < THTensor_nDimensionLegacyNoScalars(src)), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < THTensor_nDimensionLegacyNoScalars(src)), 2, "out of range"); + THArgCheck( (dimension1 >= 0) && (dimension1 < src->dim()), 1, "out of range"); + THArgCheck( (dimension2 >= 0) && (dimension2 < src->dim()), 2, "out of range"); THCTensor_(set)(state, self, src); @@ -442,8 +440,11 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di if(!src) src = self; - THArgCheck(dimension < THTensor_nDimensionLegacyNoScalars(src), 2, "out of range"); - THArgCheck(size <= THTensor_sizeLegacyNoScalars(src, dimension), 3, "out of range"); +#ifndef USE_TH_SIZE_ZERO_DIM + THArgCheck(!src->is_empty(), 1, "cannot unfold an empty tensor"); +#endif + THArgCheck(dimension < src->dim(), 2, "out of range"); + THArgCheck(size <= src->size(dimension), 3, "out of range"); THArgCheck(step > 0, 4, "invalid step"); THCTensor_(set)(state, self, src); @@ 
-452,20 +453,18 @@ void THCTensor_(unfold)(THCState *state, THCTensor *self, THCTensor *src, int di std::vector newStride(self->dim() + 1); newSize[self->dim()] = size; - newStride[self->dim()] = THTensor_strideLegacyNoScalars(self, dimension); + newStride[self->dim()] = self->stride(dimension); for(d = 0; d < self->dim(); d++) { - auto self_size = THTensor_sizeLegacyNoScalars(self, d); - auto self_stride = THTensor_strideLegacyNoScalars(self, d); if(d == dimension) { - newSize[d] = (self_size - size) / step + 1; - newStride[d] = step*self_stride; + newSize[d] = (self->size(d) - size) / step + 1; + newStride[d] = step*self->stride(d); } else { - newSize[d] = self_size; - newStride[d] = self_stride; + newSize[d] = self->size(d); + newStride[d] = self->stride(d); } } @@ -604,15 +603,15 @@ void THCTensor_(resizeNd)(THCState *state, THCTensor *self, int nDimension, int6 void THCTensor_(set1d)(THCState *state, THCTensor *tensor, int64_t x0, real value) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); THCStorage_(set)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0), value); } real THCTensor_(get1d)(THCState *state, const THCTensor *tensor, int64_t x0) { - THArgCheck(THTensor_nDimensionLegacyNoScalars(tensor) == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < THTensor_sizeLegacyNoScalars(tensor, 0)), 2, "out of range"); + THArgCheck(tensor->dim() == 1, 1, "tensor must have one dimension"); + THArgCheck( (x0 >= 0) && (x0 < tensor->size(0)), 2, "out of range"); return THCStorage_(get)(state, THTensor_getStoragePtr(tensor), tensor->storage_offset()+x0*tensor->stride(0)); } diff --git a/aten/src/THC/generic/THCTensor.h b/aten/src/THC/generic/THCTensor.h index 2ee1bf11a4be4c..dbb1591ae194f2 100644 --- a/aten/src/THC/generic/THCTensor.h +++ b/aten/src/THC/generic/THCTensor.h @@ -26,9 +26,7 @@ THC_API int THCTensor_(nDimensionLegacyNoScalars)(THCState *state, const THCTens THC_API int THCTensor_(nDimensionLegacyAll)(THCState *state, const THCTensor *self); THC_API int64_t THCTensor_(size)(THCState *state, const THCTensor *self, int dim); -THC_API int64_t THCTensor_(sizeLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API int64_t THCTensor_(stride)(THCState *state, const THCTensor *self, int dim); -THC_API int64_t THCTensor_(strideLegacyNoScalars)(THCState *state, const THCTensor *self, int dim); THC_API THLongStorage *THCTensor_(newSizeOf)(THCState *state, THCTensor *self); THC_API THLongStorage *THCTensor_(newStrideOf)(THCState *state, THCTensor *self); THC_API real *THCTensor_(data)(THCState *state, const THCTensor *self); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 82f56f9946e471..4cbf5dd224abe5 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -537,6 +537,16 @@ void THCTensor_(indexSelect)(THCState *state, THCTensor *dst, THCTensor *src, in THLongStorage *newSize; +#ifndef USE_TH_SIZE_ZERO_DIM + if (numIndices == 0) { + newSize = THCTensor_(newSizeOf)(state, src); + THLongStorage_set(newSize, 0, numIndices); + THCTensor_(resize)(state, dst, newSize, NULL); + THLongStorage_free(newSize); + return; + } +#endif + newSize = 
THCTensor_(newSizeOf)(state, src); THLongStorage_set(newSize, dim, numIndices); THCTensor_(resize)(state, dst, newSize, NULL); diff --git a/aten/src/THC/generic/THCTensorMath.cu b/aten/src/THC/generic/THCTensorMath.cu index cc1a8c9ba57e41..642b14aec48cfd 100644 --- a/aten/src/THC/generic/THCTensorMath.cu +++ b/aten/src/THC/generic/THCTensorMath.cu @@ -330,6 +330,9 @@ void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, void THCTensor_(diag)(THCState *state, THCTensor *self_, THCTensor *src_, int64_t k){ THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, self_, src_)); int nDimension = THCTensor_(nDimensionLegacyNoScalars)(state, src_); +#ifndef USE_TH_SIZE_ZERO_DIM + AT_ASSERT(!src_->is_empty()); +#endif THArgCheck((nDimension == 2) || (nDimension == 1), 1, "expected a matrix or a vector"); if (nDimension == 2) { int64_t stride0 = THCTensor_(stride)(state, src_, 0); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 591780b04edf75..17ef020e85f8ee 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -49,15 +49,11 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); - if( (mat->dim() != 2) || (THTensor_nDimensionLegacyNoScalars(vec) != 1) ) + if( (mat->dim() != 2) || (vec->dim() != 1) ) THError("2D tensor and 1D tensor expected, got %dD, %dD tensors", - mat->dim(), THTensor_nDimensionLegacyNoScalars(vec)); + mat->dim(), vec->dim()); - - auto vec_size = THTensor_sizeLegacyNoScalars(vec, 0); - auto vec_stride = THTensor_strideLegacyNoScalars(vec, 0); - - if( mat->size(1) != THTensor_sizeLegacyNoScalars(vec, 0) ) + if( mat->size(1) != vec->size(0) ) THError("size mismatch"); if(t->dim() != 1) @@ -78,12 +74,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 'n', mat->size(0), mat->size(1), alpha, THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 'n', mat->size(0), mat->size(1), alpha, THCTensor_(data)(state, mat), mat->stride(1), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -92,12 +88,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, mat), mat->stride(0), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -108,12 +104,12 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #ifdef THC_REAL_IS_FLOAT THCudaBlas_Sgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec_stride, 
+ THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) THCudaBlas_Dgemv(state, 't', mat->size(1), mat->size(0), alpha, THCTensor_(data)(state, cmat), cmat->stride(0), - THCTensor_(data)(state, vec), vec_stride, + THCTensor_(data)(state, vec), vec->stride(0), beta, THCTensor_(data)(state, r_), r_->stride(0)); #endif @@ -133,7 +129,7 @@ THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real #elif defined(THC_REAL_IS_HALF) // Currently no Hgemv/SgemvEx in Cublas THCTensor *vecAsMatrix = THCTensor_(newWithTensor)(state, vec); - THCTensor_(resize2d)(state, vecAsMatrix, vec_size, 1); + THCTensor_(resize2d)(state, vecAsMatrix, vecAsMatrix->size(0), 1); THCTensor *tAsMatrix = THCTensor_(newWithTensor)(state, t); THCTensor_(resize2d)(state, tAsMatrix, tAsMatrix->size(0), 1); @@ -155,20 +151,16 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a { #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); - if ( (THTensor_nDimensionLegacyNoScalars(vec1) != 1) || (THTensor_nDimensionLegacyNoScalars(vec2) != 1) ) { + if ( (vec1->dim() != 1) || (vec2->dim() != 1) ) { THError("1D tensors expected, got %dD, %dD tensors", - THTensor_nDimensionLegacyNoScalars(vec1), THTensor_nDimensionLegacyNoScalars(vec2)); + vec1->dim(), vec2->dim()); } - auto vec1_size = THTensor_sizeLegacyNoScalars(vec1, 0); - auto vec2_size = THTensor_sizeLegacyNoScalars(vec2, 0); - auto vec1_stride = THTensor_strideLegacyNoScalars(vec1, 0); - auto vec2_stride = THTensor_strideLegacyNoScalars(vec2, 0); if (t->dim() != 2) { THError("size mismatch"); } - if ( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { + if ( (t->size(0) != vec1->size(0)) || (t->size(1) != vec2->size(0)) ) { THError("size mismatch"); } @@ -187,28 +179,28 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a if(r_->stride(0) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec1_size, vec2_size, - alpha, THCTensor_(data)(state, vec1), vec1_stride, - THCTensor_(data)(state, vec2), vec2_stride, + THCudaBlas_Sger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), THCTensor_(data)(state, r_), r_->stride(1)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec1->size(0), vec2_size, - alpha, THCTensor_(data)(state, vec1), vec1_stride, - THCTensor_(data)(state, vec2), vec2_stride, + THCudaBlas_Dger(state, vec1->size(0), vec2->size(0), + alpha, THCTensor_(data)(state, vec1), vec1->stride(0), + THCTensor_(data)(state, vec2), vec2->stride(0), THCTensor_(data)(state, r_), r_->stride(1)); #endif } else if(r_->stride(1) == 1) { #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, r_), r_->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + 
THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, r_), r_->stride(0)); #endif } @@ -217,14 +209,14 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a THCTensor *cr = THCTensor_(newClone)(state, r_); #ifdef THC_REAL_IS_FLOAT - THCudaBlas_Sger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Sger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, cr), cr->stride(0)); #elif defined(THC_REAL_IS_DOUBLE) - THCudaBlas_Dger(state, vec2_size, vec1_size, - alpha, THCTensor_(data)(state, vec2), vec2_stride, - THCTensor_(data)(state, vec1), vec1_stride, + THCudaBlas_Dger(state, vec2->size(0), vec1->size(0), + alpha, THCTensor_(data)(state, vec2), vec2->stride(0), + THCTensor_(data)(state, vec1), vec1->stride(0), THCTensor_(data)(state, cr), cr->stride(0)); #endif @@ -233,11 +225,11 @@ THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real a #elif defined(THC_REAL_IS_HALF) // currently no Hger/SgerEx in Cublas. THCTensor *vec2T = THCTensor_(newWithTensor)(state, vec2); - THCTensor_(resize2d)(state, vec2T, vec2_size, 1); + THCTensor_(resize2d)(state, vec2T, vec2T->size(0), 1); THCTensor_(transpose)(state, vec2T, NULL, 0, 1); THCTensor *vec1M = THCTensor_(newWithTensor)(state, vec1); - THCTensor_(resize2d)(state, vec1M, vec1_size, 1); + THCTensor_(resize2d)(state, vec1M, vec1M->size(0), 1); THCTensor_(addmm)(state, r_, beta, t, alpha, vec1M, vec2T); THCTensor_(free)(state, vec2T); diff --git a/aten/src/THCUNN/CMakeLists.txt b/aten/src/THCUNN/CMakeLists.txt index 78faef7a7f227b..79b11c2db9b64f 100644 --- a/aten/src/THCUNN/CMakeLists.txt +++ b/aten/src/THCUNN/CMakeLists.txt @@ -43,6 +43,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/SpatialDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/SpatialGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/SpatialReflectionPadding.cu @@ -70,6 +71,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricDilatedMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFractionalMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullConvolution.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricFullDilatedConvolution.cu +${CMAKE_CURRENT_SOURCE_DIR}/VolumetricGridSamplerBilinear.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxPooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricMaxUnpooling.cu ${CMAKE_CURRENT_SOURCE_DIR}/VolumetricReplicationPadding.cu diff --git a/aten/src/THCUNN/ELU.cu b/aten/src/THCUNN/ELU.cu index 9c4c2ea1fdc8b6..d17d185b4858bf 100644 --- a/aten/src/THCUNN/ELU.cu +++ b/aten/src/THCUNN/ELU.cu @@ -8,17 +8,15 @@ struct ELUupdateOutput_functor { const T negcoef_; const T poscoef_; - const T negiptcoef_; - ELUupdateOutput_functor(T negcoef, T poscoef, T negiptcoef) + ELUupdateOutput_functor(T negcoef, T poscoef) : negcoef_(negcoef) , poscoef_(poscoef) - , negiptcoef_(negiptcoef) {} __device__ void operator()(T *output, const T *input) const { - *output = *input <= 0 ? (exp(*input * negiptcoef_) - 1) * negcoef_ : *input * poscoef_; + *output = *input <= 0 ? 
(exp(*input) - 1) * negcoef_ : *input * poscoef_; } }; @@ -28,17 +26,15 @@ struct ELUupdateOutputIP_functor { const T negcoef_; const T poscoef_; - const T negiptcoef_; - ELUupdateOutputIP_functor(T negcoef, T poscoef, T negiptcoef) + ELUupdateOutputIP_functor(T negcoef, T poscoef) : negcoef_(negcoef) , poscoef_(poscoef) - , negiptcoef_(negiptcoef) {} __device__ void operator()(T *x) const { - *x = *x <= 0 ? (exp(*x * negiptcoef_) - 1) * negcoef_ : *x * poscoef_; + *x = *x <= 0 ? (exp(*x) - 1) * negcoef_ : *x * poscoef_; } }; @@ -47,17 +43,15 @@ struct ELUupdateGradInput_functor { const T negcoef_; const T poscoef_; - const T negiptcoef_; - ELUupdateGradInput_functor(T negcoef, T poscoef, T negiptcoef) + ELUupdateGradInput_functor(T negcoef, T poscoef) : negcoef_(negcoef) , poscoef_(poscoef) - , negiptcoef_(negiptcoef) {} __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - *gradInput = (*output) <= 0 ? (*gradOutput * negiptcoef_ * (*output + negcoef_)) : (*gradOutput * poscoef_); + *gradInput = (*output) <= 0 ? (*gradOutput * (*output + negcoef_)) : (*gradOutput * poscoef_); } }; diff --git a/aten/src/THCUNN/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu new file mode 100644 index 00000000000000..30a1a5d5ade10b --- /dev/null +++ b/aten/src/THCUNN/SpatialGridSamplerBilinear.cu @@ -0,0 +1,243 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < W && y >= 0 && y < H) +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, H, W)) { \ + atomicAdd(&input[n][c][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? 
(a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + int c; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw, iy_nw, IH, IW)) { + out_val += input[n][c][iy_nw][ix_nw] * nw; + } + if (WITHIN_BOUNDS(ix_ne, iy_ne, IH, IW)) { + out_val += input[n][c][iy_ne][ix_ne] * ne; + } + if (WITHIN_BOUNDS(ix_sw, iy_sw, IH, IW)) { + out_val += input[n][c][iy_sw][ix_sw] * sw; + } + if (WITHIN_BOUNDS(ix_se, iy_se, IH, IW)) { + out_val += input[n][c][iy_se][ix_se] * se; + } + output[n][c][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void SpatialGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int IH = input.getSize(2); + int IW = input.getSize(3); + int H = grid.getSize(1); + int W = grid.getSize(2); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int h = (index / N) % H; + const int w = (index / (N * H)) % W; + + // get the corresponding input x, y co-ordinates from grid + Dtype ix = grid[n][h][w][0]; + Dtype iy = grid[n][h][w][1]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1));; + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ScalarConvert::to(ix)); + int iy_nw = floor(ScalarConvert::to(iy));; + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; 
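// Editor's note: host-side sketch of the coordinate math these bilinear sampler
// kernels apply per output element (illustrative only; the real code runs as a
// CUDA kernel over device tensor views, and the struct/function names here are
// hypothetical). Grid values in [-1, 1] are mapped to pixel space, and the four
// corner weights are the usual bilinear surface areas used above.
#include <cmath>

struct BilinearTaps { int x0, y0; float w_nw, w_ne, w_sw, w_se; };

inline BilinearTaps bilinear_taps(float gx, float gy, int IH, int IW) {
  float ix = (gx + 1.f) / 2.f * (IW - 1);      // [-1,1] -> [0, IW-1]
  float iy = (gy + 1.f) / 2.f * (IH - 1);      // [-1,1] -> [0, IH-1]
  int x0 = static_cast<int>(std::floor(ix));   // north-west corner
  int y0 = static_cast<int>(std::floor(iy));
  float tx = ix - x0, ty = iy - y0;            // fractional offsets
  return {x0, y0,
          (1 - tx) * (1 - ty),   // nw weight, i.e. (ix_se - ix) * (iy_se - iy)
          tx * (1 - ty),         // ne
          (1 - tx) * ty,         // sw
          tx * ty};              // se
}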
+ int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + Dtype nw = (ix_se - ix) * (iy_se - iy); + Dtype ne = (ix - ix_sw) * (iy_sw - iy); + Dtype sw = (ix_ne - ix) * (iy - iy_ne); + Dtype se = (ix - ix_nw) * (iy - iy_nw); + + Dtype gradout; + Dtype nw_val; + Dtype ne_val; + Dtype sw_val; + Dtype se_val; + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + nw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_nw_cl, iy_nw_cl, IH, IW)) { + nw_val = input[n][c][iy_nw_cl][ix_nw_cl]; + } + ne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_ne_cl, iy_ne_cl, IH, IW)) { + ne_val = input[n][c][iy_ne_cl][ix_ne_cl]; + } + sw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_sw_cl, iy_sw_cl, IH, IW)) { + sw_val = input[n][c][iy_sw_cl][ix_sw_cl]; + } + se_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_se_cl, iy_se_cl, IH, IW)) { + se_val = input[n][c][iy_se_cl][ix_se_cl]; + } + + gix += ScalarConvert::to(-1)*(nw_val * (iy_se - iy) * gradout); + gix += ne_val * (iy_sw - iy) * gradout; + gix += ScalarConvert::to(-1)*(sw_val * (iy - iy_ne) * gradout); + gix += se_val * (iy - iy_nw) * gradout; + + giy += ScalarConvert::to(-1)*(nw_val * (ix_se - ix) * gradout); + giy += ScalarConvert::to(-1)*(ne_val * (ix - ix_sw) * gradout); + giy += sw_val * (ix_ne - ix) * gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + Dtype gix_old = gradGrid[n][h][w][0]; + Dtype giy_old = gradGrid[n][h][w][1]; + + gradGrid[n][h][w][0] = gix_old + gix; + gradGrid[n][h][w][1] = giy_old + giy; + } +} + +#undef MIN +#undef MAX +#undef CLIP_COORDINATES +#undef WITHIN_BOUNDS +#undef SAFE_ADD + +#include "generic/SpatialGridSamplerBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu new file mode 100644 index 00000000000000..43b8ceff1cb8ae --- /dev/null +++ b/aten/src/THCUNN/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,421 @@ +#include "THCUNN.h" +#include "common.h" +#include "THCDeviceTensor.cuh" +#include "THCDeviceTensorUtils.cuh" +#include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +#define WITHIN_BOUNDS(x, y, z, D, H, W) (x >= 0 && x < W && y >= 0 && y < H && z >= 0 && z < D) +#define SAFE_ADD(input, x, y, z, n, c, D, H, 
W, value) \ + do { \ + if (WITHIN_BOUNDS(x, y, z, D, H, W)) { \ + atomicAdd(&input[n][c][z][y][x], value); \ + } \ + } while(0) + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +const int MODE_BORDER = 1; + + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateOutput_kernel( + const int nthreads, + THCDeviceTensor input, + THCDeviceTensor grid, + THCDeviceTensor output, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + int c; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + // calculate bilinear weighted pixel value and set output pixel + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, 
IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + Dtype out_val; + for (c = 0; c < C; ++c) { + out_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw, iy_tnw, iz_tnw, ID, IH, IW)) { + out_val += input[n][c][iz_tnw][iy_tnw][ix_tnw] * tnw; + } + if (WITHIN_BOUNDS(ix_tne, iy_tne, iz_tne, ID, IH, IW)) { + out_val += input[n][c][iz_tne][iy_tne][ix_tne] * tne; + } + if (WITHIN_BOUNDS(ix_tsw, iy_tsw, iz_tsw, ID, IH, IW)) { + out_val += input[n][c][iz_tsw][iy_tsw][ix_tsw] * tsw; + } + if (WITHIN_BOUNDS(ix_tse, iy_tse, iz_tse, ID, IH, IW)) { + out_val += input[n][c][iz_tse][iy_tse][ix_tse] * tse; + } + if (WITHIN_BOUNDS(ix_bnw, iy_bnw, iz_bnw, ID, IH, IW)) { + out_val += input[n][c][iz_bnw][iy_bnw][ix_bnw] * bnw; + } + if (WITHIN_BOUNDS(ix_bne, iy_bne, iz_bne, ID, IH, IW)) { + out_val += input[n][c][iz_bne][iy_bne][ix_bne] * bne; + } + if (WITHIN_BOUNDS(ix_bsw, iy_bsw, iz_bsw, ID, IH, IW)) { + out_val += input[n][c][iz_bsw][iy_bsw][ix_bsw] * bsw; + } + if (WITHIN_BOUNDS(ix_bse, iy_bse, iz_bse, ID, IH, IW)) { + out_val += input[n][c][iz_bse][iy_bse][ix_bse] * bse; + } + output[n][c][d][h][w] = out_val; + } + } +} + +template +__launch_bounds__(1024) +__global__ void VolumetricGridSamplerBilinear_updateGradInput_kernel( + const int nthreads, + THCDeviceTensor input, THCDeviceTensor gradInput, + THCDeviceTensor grid, THCDeviceTensor gradGrid, + THCDeviceTensor gradOutput, + const int padding_mode) { + + int N = input.getSize(0); + int C = input.getSize(1); + int ID = input.getSize(2); + int IH = input.getSize(3); + int IW = input.getSize(4); + int D = grid.getSize(1); + int H = grid.getSize(2); + int W = grid.getSize(3); + + CUDA_KERNEL_LOOP(index, nthreads) { + + const int n = index % N; + const int d = (index / N) % D; + const int h = (index / (N * D)) % H; + const int w = (index / (N * D * H)) % W; + + // get the corresponding input x, y, z co-ordinates from grid + Dtype ix = grid[n][d][h][w][0]; + Dtype iy = grid[n][d][h][w][1]; + Dtype iz = grid[n][d][h][w][2]; + + Dtype gix = ScalarConvert::to(0); + Dtype giy = ScalarConvert::to(0); + Dtype giz = ScalarConvert::to(0); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ScalarConvert::to(((ix + 1.f) / 2) * (IW-1)); + iy = ScalarConvert::to(((iy + 1.f) / 2) * (IH-1)); + iz = ScalarConvert::to(((iz + 1.f) / 2) * (ID-1)); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ScalarConvert::to(ix)); + int iy_tnw = floor(ScalarConvert::to(iy)); + int iz_tnw = floor(ScalarConvert::to(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + Dtype tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + Dtype tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + Dtype tsw 
= (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + Dtype tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + Dtype bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + Dtype bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + Dtype bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + Dtype bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + Dtype gradout; + Dtype tnw_val; + Dtype tne_val; + Dtype tsw_val; + Dtype tse_val; + Dtype bnw_val; + Dtype bne_val; + Dtype bsw_val; + Dtype bse_val; + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + gradout = gradOutput[n][c][d][h][w]; + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + tnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ID, IH, IW)) { + tnw_val = input[n][c][iz_tnw_cl][iy_tnw_cl][ix_tnw_cl]; + } + tne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tne_cl, iy_tne_cl, iz_tne_cl, ID, 
IH, IW)) { + tne_val = input[n][c][iz_tne_cl][iy_tne_cl][ix_tne_cl]; + } + tsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ID, IH, IW)) { + tsw_val = input[n][c][iz_tsw_cl][iy_tsw_cl][ix_tsw_cl]; + } + tse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_tse_cl, iy_tse_cl, iz_tse_cl, ID, IH, IW)) { + tse_val = input[n][c][iz_tse_cl][iy_tse_cl][ix_tse_cl]; + } + bnw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ID, IH, IW)) { + bnw_val = input[n][c][iz_bnw_cl][iy_bnw_cl][ix_bnw_cl]; + } + bne_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bne_cl, iy_bne_cl, iz_bne_cl, ID, IH, IW)) { + bne_val = input[n][c][iz_bne_cl][iy_bne_cl][ix_bne_cl]; + } + bsw_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ID, IH, IW)) { + bsw_val = input[n][c][iz_bsw_cl][iy_bsw_cl][ix_bsw_cl]; + } + bse_val = ScalarConvert::to(0); + if (WITHIN_BOUNDS(ix_bse_cl, iy_bse_cl, iz_bse_cl, ID, IH, IW)) { + bse_val = input[n][c][iz_bse_cl][iy_bse_cl][ix_bse_cl]; + } + + Dtype m1 = ScalarConvert::to(-1); + gix += m1 * tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix += m1 * tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix += m1 * bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix += m1 * bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy += m1 * tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy += m1 * tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy += m1 * bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy += m1 * bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz += m1 * tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz += m1 * tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz += m1 * tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz += m1 * tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + giz = giz * (ID - 1) / 2; + + Dtype gix_old = gradGrid[n][d][h][w][0]; + Dtype giy_old = gradGrid[n][d][h][w][1]; + Dtype giz_old = gradGrid[n][d][h][w][2]; + + gradGrid[n][d][h][w][0] = gix_old + gix; + gradGrid[n][d][h][w][1] = giy_old + giy; + gradGrid[n][d][h][w][2] = giz_old + giz; + } +} + +#undef MIN +#undef MAX +#undef CLIP_COORDINATES +#undef WITHIN_BOUNDS +#undef SAFE_ADD + +#include "generic/VolumetricGridSamplerBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h index e2a99640ba69b6..47f9bee0fb6744 100644 --- a/aten/src/THCUNN/common.h +++ b/aten/src/THCUNN/common.h @@ -62,7 +62,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ 
- THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ + THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -70,7 +70,7 @@ inline int GET_BLOCKS(const int N) #define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ if (THCIndexTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \ - THCIndexTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \ + THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ diff --git a/aten/src/THCUNN/generic/BatchNormalization.cu b/aten/src/THCUNN/generic/BatchNormalization.cu index 81eabc68812f36..03dd38a7bd76ee 100644 --- a/aten/src/THCUNN/generic/BatchNormalization.cu +++ b/aten/src/THCUNN/generic/BatchNormalization.cu @@ -21,11 +21,11 @@ static THCDeviceTensor THNN_(devicetensor)(THCState *state, THCTensor int size[Dim]; for (int i = 0; i < Dim || i < inDim; ++i) { if (i < Dim && i < inDim) { - size[i] = THTensor_sizeLegacyNoScalars(t, i); + size[i] = t->size(i); } else if (i < Dim) { size[i] = 1; } else { - size[Dim - 1] *= THTensor_sizeLegacyNoScalars(t, i); + size[Dim - 1] *= t->size(i); } } return THCDeviceTensor(t->data(), size); diff --git a/aten/src/THCUNN/generic/ClassNLLCriterion.cu b/aten/src/THCUNN/generic/ClassNLLCriterion.cu index 6866c5798f7d23..6126dee76dcb27 100644 --- a/aten/src/THCUNN/generic/ClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/ClassNLLCriterion.cu @@ -16,7 +16,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } int n_dims = THCTensor_(nDimensionLegacyNoScalars)(state, input); - int n_classes = THCTensor_(sizeLegacyNoScalars)(state, input, n_dims - 1); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); ignore_index -= TH_INDEX_BASE; if (weights) { @@ -31,8 +31,8 @@ void THNN_(ClassNLLCriterion_updateOutput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); - int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(sizeLegacyNoScalars)(state, input, 0); - int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); + int64_t batch_size = n_dims == 1 ? 1 : THCTensor_(size)(state, input, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); @@ -152,7 +152,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( THArgCheck(!input->is_empty() && (n_dims <= 2 && n_dims > 0), 2, "non-empty vector or matrix expected"); int64_t batch_size = n_dims == 1 ? 
1 : THCTensor_(size)(state, input, 0); - int64_t num_targets = THCudaLongTensor_sizeLegacyNoScalars(state, target, 0); + int64_t num_targets = THCudaLongTensor_size(state, target, 0); THArgCheck(batch_size == num_targets, 2, "mismatch between the batch size of input (%ld) and that of target (%ld)", batch_size, num_targets); diff --git a/aten/src/THCUNN/generic/ELU.cu b/aten/src/THCUNN/generic/ELU.cu index 6f78349110ec35..5c09a0607f0246 100644 --- a/aten/src/THCUNN/generic/ELU.cu +++ b/aten/src/THCUNN/generic/ELU.cu @@ -11,23 +11,21 @@ void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, - accreal input_scale, bool inplace) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale * input_scale); - real negiptcoef = ScalarConvert::to(input_scale); + real poscoef = ScalarConvert::to(scale); THCUNN_assertSameGPU(state, 2, input, output); if (inplace) { - THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef, negiptcoef)); + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(negcoef, poscoef)); THCTensor_(set)(state, output, input); } else { THCTensor_(resizeAs)(state, output, input); - THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef, negiptcoef)); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(negcoef, poscoef)); } } @@ -38,17 +36,15 @@ void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale, - accreal input_scale) + accreal scale) { real negcoef = ScalarConvert::to(alpha * scale); - real poscoef = ScalarConvert::to(scale * input_scale); - real negiptcoef = ScalarConvert::to(input_scale); + real poscoef = ScalarConvert::to(scale); THCUNN_check_nElement(state, output, gradOutput); THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); THCTensor_(resizeAs)(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef, negiptcoef)); + THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(negcoef, poscoef)); } #endif diff --git a/aten/src/THCUNN/generic/GatedLinearUnit.cu b/aten/src/THCUNN/generic/GatedLinearUnit.cu index 9bd59eec538cb6..4622403e76088f 100644 --- a/aten/src/THCUNN/generic/GatedLinearUnit.cu +++ b/aten/src/THCUNN/generic/GatedLinearUnit.cu @@ -12,7 +12,7 @@ void THNN_(GatedLinear_updateOutput)( // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THCTensor_(sizeLegacyNoScalars)(state, input, dim); + const int64_t nIn = THCTensor_(size)(state, input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); const int64_t inputSize = THCTensor_(size)(state, input, dim) / 2; diff --git a/aten/src/THCUNN/generic/MultiMarginCriterion.cu b/aten/src/THCUNN/generic/MultiMarginCriterion.cu index 65bd6cdec850bb..8272b3d4020ec7 100644 --- a/aten/src/THCUNN/generic/MultiMarginCriterion.cu +++ b/aten/src/THCUNN/generic/MultiMarginCriterion.cu @@ -18,7 +18,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( input = THCTensor_(newContiguous)(state, input); if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (THTensor_nDimensionLegacyNoScalars(input) == 1) + if (input->dim() == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -30,7 +30,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? 
THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(input, 0), + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -42,7 +42,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(input, 0), + 1, input->size(0), reduction == Reduction::ElementwiseMean, margin ); @@ -52,7 +52,7 @@ void THNN_(MultiMarginCriterion_updateOutput)( else if (input->dim() == 2) { int nframe = input->size(0); - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); dim3 blocks(input->size(0)); dim3 threads(MULTIMARGIN_THREADS); @@ -149,7 +149,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( if(weights) weights = THCTensor_(newContiguous)(state, weights); - if (THTensor_nDimensionLegacyNoScalars(input) == 1) + if (input->dim() == 1) { dim3 blocks(1); dim3 threads(MULTIMARGIN_THREADS); @@ -162,7 +162,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(gradInput, 0), + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -176,7 +176,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( THCTensor_(data)(state, input), THCIndexTensor_(data)(state, target), weights ? THCTensor_(data)(state, weights) : NULL, - 1, THTensor_sizeLegacyNoScalars(gradInput, 0), + 1, gradInput->size(0), reduction == Reduction::ElementwiseMean, margin, reduction != Reduction::None @@ -187,7 +187,7 @@ void THNN_(MultiMarginCriterion_updateGradInput)( else if (input->dim() == 2) { int nframe = gradInput->size(0); - THArgCheck(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), 3, + THArgCheck(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), 3, "inconsistent target size"); dim3 blocks(gradInput->size(0)); dim3 threads(MULTIMARGIN_THREADS); diff --git a/aten/src/THCUNN/generic/PReLU.cu b/aten/src/THCUNN/generic/PReLU.cu index 2a0d719ff6a3e6..2517b409409aed 100644 --- a/aten/src/THCUNN/generic/PReLU.cu +++ b/aten/src/THCUNN/generic/PReLU.cu @@ -24,8 +24,8 @@ void THNN_(PReLU_updateOutput)( input = THCTensor_(newContiguous)(state, input); int n = THCTensor_(nElement)(state, input); - if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { @@ -69,8 +69,8 @@ void THNN_(PReLU_updateGradInput)( gradOutput = THCTensor_(newContiguous)(state, gradOutput); int n = THCTensor_(nElement)(state, input); - if (THTensor_sizeLegacyNoScalars(input, ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, ndim > 1)); + if (input->size(ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(ndim > 1)); int mapSize = 1; for (int d = 2; d < ndim; d++) { diff --git a/aten/src/THCUNN/generic/SparseLinear.cu b/aten/src/THCUNN/generic/SparseLinear.cu index 0363dcf0e3996a..f73bd5835c04bb 100644 --- a/aten/src/THCUNN/generic/SparseLinear.cu +++ b/aten/src/THCUNN/generic/SparseLinear.cu @@ -4,17 +4,17 @@ static bool THNN_(checkInput)(THCTensor* t) { - return !t->is_empty() && t->dim() == 2 && t->size(1) == 3; + return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(1) == 3; } static bool THNN_(checkSize2D)(THCTensor* t, int64_t size0, int64_t size1) { - return !t->is_empty() && t->dim() == 2 && t->size(0) == size0 && t->size(1) == size1; + return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 2 && t->size(0) == size0 && t->size(1) == size1; } static bool THNN_(checkSize1D)(THCTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; + return !t->is_empty() && THTensor_nDimensionLegacyAll(t) == 1 && t->size(0) == size0; } static inline void THNN_(copyCudaFloatingType)(THCState *state, THCudaIntTensor *buf, THCTensor *t) { diff --git a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu index ae211774a580db..b7010977558816 100644 --- a/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu +++ b/aten/src/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -8,10 +8,10 @@ void THNN_(SpatialClassNLLCriterion_shapeCheck)( THCIndexTensor *target, THCTensor *weights) { - AT_CHECK(!target->is_empty() && target->dim() == 3, 1, + AT_CHECK(!target->is_empty() && THCIndexTensor_(nDimensionLegacyNoScalars)(state, target) == 3, 1, "only batches of spatial targets supported (non-empty 3D tensors)" \ " but got targets of size: : ", target->sizes()); - AT_CHECK(!input->is_empty() && input->dim() == 4, 2, + AT_CHECK(!input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, "only batches of spatial inputs supported (non-empty 4D tensors), " \ "but got input of size: ", input->sizes()); if (THCTensor_(size)(state, input, 0) != THCIndexTensor_(size)(state, target, 0) || diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 7860404b685f52..334afe93cb727e 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -73,7 +73,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 546ec2ae3c6185..7c6716c41f5bff 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -31,7 +31,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( // Bias has same # of channels as output if (bias) { - THAssert(THTensor_sizeLegacyNoScalars(bias, 0) == weight->size(0)); + THAssert(bias->size(0) == weight->size(0)); } input = THCTensor_(newContiguous)(state, input); diff --git 
a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu index 4225583735460e..ad0f47418b86cf 100644 --- a/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu index 8d039d54068aaf..76777796e361e4 100644 --- a/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialFullDilatedConvolution.cu @@ -65,7 +65,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimh, outputHeight); @@ -351,7 +351,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight != NULL) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias != NULL) { - nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); + nOutputPlane = THCTensor_(size)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu new file mode 100644 index 00000000000000..7e285cb55fa7d2 --- /dev/null +++ b/aten/src/THCUNN/generic/SpatialGridSamplerBilinear.cu @@ -0,0 +1,97 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialGridSamplerBilinear.cu" +#else + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 4, 2, grid, + "4D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t iheight = THCTensor_(size)(state, input, 2); + int64_t iwidth = THCTensor_(size)(state, input, 3); + int64_t oheight = THCTensor_(size)(state, grid, 1); + int64_t owidth = THCTensor_(size)(state, grid, 2); + + THCUNN_check_dim_size(state, grid, 4, 0, nbatch); + THCUNN_check_dim_size(state, grid, 4, 3, 2); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 4, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 4, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 4, 2, oheight); + THCUNN_check_dim_size(state, gradOutput, 4, 3, owidth); + } +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + 
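(Editorial aside, not part of the patch: the sampling math used by the updateOutput kernels above, and by the CPU path further down in this diff, can be summarized in a small self-contained sketch. It assumes the zero-padding behaviour used when padding_mode is not MODE_BORDER; the helper name sample_bilinear and the flat row-major image layout are hypothetical, chosen only for illustration.)

#include <math.h>

/* Bilinear sampling of one output location from a single-channel IH x IW
 * image stored row-major. Grid coordinates (gx, gy) are in [-1, 1] and are
 * mapped to pixel space exactly as in the kernels: x = ((gx+1)/2)*(IW-1). */
static float sample_bilinear(const float *img, int IH, int IW, float gx, float gy) {
  float ix = ((gx + 1.f) / 2.f) * (IW - 1);
  float iy = ((gy + 1.f) / 2.f) * (IH - 1);
  int ix_nw = (int)floorf(ix), iy_nw = (int)floorf(iy);   /* north-west corner */
  int ix_ne = ix_nw + 1,       iy_ne = iy_nw;
  int ix_sw = ix_nw,           iy_sw = iy_nw + 1;
  int ix_se = ix_nw + 1,       iy_se = iy_nw + 1;
  /* each weight is the area of the sub-rectangle opposite its corner */
  float nw = (ix_se - ix) * (iy_se - iy);
  float ne = (ix - ix_sw) * (iy_sw - iy);
  float sw = (ix_ne - ix) * (iy - iy_ne);
  float se = (ix - ix_nw) * (iy - iy_nw);
  float out = 0.f;
  /* out-of-bounds corners contribute zero (zero padding) */
  if (ix_nw >= 0 && ix_nw < IW && iy_nw >= 0 && iy_nw < IH) out += img[iy_nw * IW + ix_nw] * nw;
  if (ix_ne >= 0 && ix_ne < IW && iy_ne >= 0 && iy_ne < IH) out += img[iy_ne * IW + ix_ne] * ne;
  if (ix_sw >= 0 && ix_sw < IW && iy_sw >= 0 && iy_sw < IH) out += img[iy_sw * IW + ix_sw] * sw;
  if (ix_se >= 0 && ix_se < IW && iy_se >= 0 && iy_se < IH) out += img[iy_se * IW + ix_se] * se;
  return out;
}

The four weights always sum to 1, so the border padding mode (which clips each corner index instead of dropping it) effectively reproduces the nearest edge pixel for coordinates that fall outside the image. The volumetric kernels extend the same scheme to eight trilinear corner weights.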
THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state,grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + // resize output to the same shape as input + THCTensor_(resize4d)(state, output, N, C, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(SpatialGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t IH = THCTensor_(size)(state, input, 2); + int64_t IW = THCTensor_(size)(state, input, 3); + int64_t H = THCTensor_(size)(state, grid, 1); + int64_t W = THCTensor_(size)(state, grid, 2); + + THCTensor_(resize4d)(state, gradInput, N, C, IH, IW); + THCTensor_(resize4d)(state, gradGrid, N, H, W, 2); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*H*W); + SpatialGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THCUNN/generic/THCUNN.h b/aten/src/THCUNN/generic/THCUNN.h index 3c4883a1e3c45d..eaadf66c8306ee 100644 --- a/aten/src/THCUNN/generic/THCUNN.h +++ b/aten/src/THCUNN/generic/THCUNN.h @@ -119,7 +119,6 @@ THC_API void THNN_(ELU_updateOutput)( THCTensor *output, accreal alpha, accreal scale, - accreal input_scale, bool inplace); THC_API void THNN_(ELU_updateGradInput)( @@ -128,8 +127,7 @@ THC_API void THNN_(ELU_updateGradInput)( THCTensor *gradInput, THCTensor *output, accreal alpha, - accreal scale, - accreal input_scale); + accreal scale); THC_API void THNN_(FeatureLPPooling_updateOutput)( THCState* state, @@ -1047,6 +1045,34 @@ THC_API void THNN_(SpatialUpSamplingNearest_updateOutput)( int outputHeight, int outputWidth); +THC_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode); + +THC_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor 
*output, + int padding_mode); + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode); + THC_API void THNN_(RReLU_updateOutput)( THCState *state, THCTensor *input, diff --git a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu index 310f22d03e5dfa..870d38ba225f8c 100644 --- a/aten/src/THCUNN/generic/TemporalReflectionPadding.cu +++ b/aten/src/THCUNN/generic/TemporalReflectionPadding.cu @@ -79,7 +79,7 @@ void THNN_(TemporalReflectionPadding_updateGradInput)( int planeDim = 0; int dimw = 1; - int numInputDims = input->dim(); + int numInputDims = THCTensor_(nDimensionLegacyNoScalars)(state, input); if (numInputDims == 3) { planeDim++; dimw++; diff --git a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu index d6ffba3519553c..52d97fbf2a3638 100644 --- a/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -75,7 +75,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THCUNN_check_dim_size(state, gradOutput, ndim, dimf, nOutputPlane); } THCUNN_check_dim_size(state, gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu index 10a5fdc2643193..96310609e956f4 100644 --- a/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu +++ b/aten/src/THCUNN/generic/VolumetricFullDilatedConvolution.cu @@ -387,7 +387,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THCTensor_(size)(state, gradWeight, 1); } else if (gradBias) { - nOutputPlane = THCTensor_(sizeLegacyNoScalars)(state, gradBias, 0); + nOutputPlane = THCTensor_(size)(state, gradBias, 0); } else { return; } diff --git a/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu new file mode 100644 index 00000000000000..086667ca476ac1 --- /dev/null +++ b/aten/src/THCUNN/generic/VolumetricGridSamplerBilinear.cu @@ -0,0 +1,104 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.cu" +#else + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *gradOutput) { + THCUNN_argCheck(state, !input->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, input) == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THCUNN_argCheck(state, !grid->is_empty() && THCTensor_(nDimensionLegacyNoScalars)(state, grid) == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int64_t nbatch = THCTensor_(size)(state, input, 0); + int64_t channels = THCTensor_(size)(state, input, 1); + int64_t idepth = THCTensor_(size)(state, input, 2); + int64_t iheight = THCTensor_(size)(state, input, 3); + int64_t iwidth = THCTensor_(size)(state, input, 4); + int64_t odepth = THCTensor_(size)(state, grid, 1); + int64_t oheight = THCTensor_(size)(state, grid, 2); + int64_t owidth = THCTensor_(size)(state, 
grid, 3); + + THCUNN_check_dim_size(state, grid, 5, 0, nbatch); + THCUNN_check_dim_size(state, grid, 5, 4, 3); + + if (gradOutput != NULL) { + THCUNN_check_dim_size(state, gradOutput, 5, 0, nbatch); + THCUNN_check_dim_size(state, gradOutput, 5, 1, channels); + THCUNN_check_dim_size(state, gradOutput, 5, 2, odepth); + THCUNN_check_dim_size(state, gradOutput, 5, 3, oheight); + THCUNN_check_dim_size(state, gradOutput, 5, 4, owidth); + } +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *grid, + THCTensor *output, + int padding_mode) { + + THCUNN_assertSameGPU(state, 3, input, grid, output); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, NULL); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + // resize output to the same shape as input + THCTensor_(resize5d)(state, output, N, C, D, H, W); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devOutput = toDeviceTensor(state, output); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateOutput_kernel + <<>>( + count, devInput, devGrid, devOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +THC_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THCState *state, + THCTensor *input, THCTensor *gradInput, + THCTensor *grid, THCTensor *gradGrid, + THCTensor *gradOutput, + int padding_mode) { + + THCUNN_assertSameGPU(state, 5, input, gradInput, grid, gradGrid, gradOutput); + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(state, input, grid, gradOutput); + int64_t N = THCTensor_(size)(state, input, 0); + int64_t C = THCTensor_(size)(state, input, 1); + int64_t ID = THCTensor_(size)(state, input, 2); + int64_t IH = THCTensor_(size)(state, input, 3); + int64_t IW = THCTensor_(size)(state, input, 4); + int64_t D = THCTensor_(size)(state,grid, 1); + int64_t H = THCTensor_(size)(state,grid, 2); + int64_t W = THCTensor_(size)(state, grid, 3); + + THCTensor_(resize5d)(state, gradInput, N, C, ID, IH, IW); + THCTensor_(resize5d)(state, gradGrid, N, D, H, W, 3); + THCTensor_(zero)(state, gradInput); + THCTensor_(zero)(state, gradGrid); + + THCDeviceTensor devInput = toDeviceTensor(state, input); + THCDeviceTensor devGradInput = toDeviceTensor(state, gradInput); + THCDeviceTensor devGrid = toDeviceTensor(state, grid); + THCDeviceTensor devGradGrid = toDeviceTensor(state, gradGrid); + THCDeviceTensor devGradOutput = toDeviceTensor(state, gradOutput); + + int count = static_cast(N*D*H*W); + VolumetricGridSamplerBilinear_updateGradInput_kernel + <<>>( + count, devInput, devGradInput, devGrid, devGradGrid, devGradOutput, padding_mode); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/aten/src/THNN/generic/ClassNLLCriterion.c b/aten/src/THNN/generic/ClassNLLCriterion.c index 7db0531d60d1ef..c7d42b583374cc 100644 --- a/aten/src/THNN/generic/ClassNLLCriterion.c +++ b/aten/src/THNN/generic/ClassNLLCriterion.c @@ -82,7 +82,7 @@ void THNN_(ClassNLLCriterion_updateOutput)( } } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - 
THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); @@ -189,7 +189,7 @@ void THNN_(ClassNLLCriterion_updateGradInput)( } else if (THTensor_(nDimensionLegacyAll)(input) == 2) { int batch_size = THTensor_(size)(input, 0); - THAssert(THTensor_sizeLegacyNoScalars(target, 0) == batch_size); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); int n_target = THTensor_(size)(input, 1); diff --git a/aten/src/THNN/generic/ELU.c b/aten/src/THNN/generic/ELU.c index 62111ebbf4d7c2..f2d87185b813a5 100644 --- a/aten/src/THNN/generic/ELU.c +++ b/aten/src/THNN/generic/ELU.c @@ -8,21 +8,19 @@ void THNN_(ELU_updateOutput)( THTensor *output, accreal alpha_, accreal scale, - accreal input_scale, bool inplace) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); - real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); if (inplace) { TH_TENSOR_APPLY(real, input, - *input_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; + *input_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; ); THTensor_(set)(output, input); } else { THTensor_(resizeAs)(output, input); TH_TENSOR_APPLY2(real, input, real, output, - *output_data = *input_data <= 0 ? (exp(*input_data * negiptcoef)-1) * negcoef : *input_data * poscoef; + *output_data = *input_data <= 0 ? (exp(*input_data)-1) * negcoef : *input_data * poscoef; ); } } @@ -33,16 +31,14 @@ void THNN_(ELU_updateGradInput)( THTensor *gradInput, THTensor *output, accreal alpha_, - accreal scale, - accreal input_scale) + accreal scale) { real negcoef = TH_CONVERT_ACCREAL_TO_REAL(alpha_ * scale); - real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale * input_scale); - real negiptcoef = TH_CONVERT_ACCREAL_TO_REAL(input_scale); + real poscoef = TH_CONVERT_ACCREAL_TO_REAL(scale); THNN_CHECK_NELEMENT(output, gradOutput); THTensor_(resizeAs)(gradInput, output); TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, - *gradInput_data = *output_data <= 0 ? *gradOutput_data * negiptcoef * (*output_data + negcoef) : *gradOutput_data * poscoef; + *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + negcoef) : *gradOutput_data * poscoef; ); } diff --git a/aten/src/THNN/generic/GatedLinearUnit.c b/aten/src/THNN/generic/GatedLinearUnit.c index 0f888744240473..68cdc37d54214a 100644 --- a/aten/src/THNN/generic/GatedLinearUnit.c +++ b/aten/src/THNN/generic/GatedLinearUnit.c @@ -10,7 +10,7 @@ void THNN_(GatedLinear_updateOutput)( { // size output to half of input dim = dim - TH_INDEX_BASE; - const int64_t nIn = THTensor_sizeLegacyNoScalars(input, dim); + const int64_t nIn = THTensor_(size)(input, dim); THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", dim + TH_INDEX_BASE, nIn); diff --git a/aten/src/THNN/generic/LookupTable.c b/aten/src/THNN/generic/LookupTable.c index fa6648e2a6b80c..2260b168d8e8d5 100644 --- a/aten/src/THNN/generic/LookupTable.c +++ b/aten/src/THNN/generic/LookupTable.c @@ -40,7 +40,7 @@ void THNN_(LookupTable_accGradParameters)( if (scaleGradByFreq) { - THIntegerTensor_(resize1d)(count, THTensor_sizeLegacyNoScalars(gradWeight, 0)); + THIntegerTensor_(resize1d)(count, gradWeight->size(0)); count_data = THIntegerTensor_(data)(count); } diff --git a/aten/src/THNN/generic/MultiLabelMarginCriterion.c b/aten/src/THNN/generic/MultiLabelMarginCriterion.c index a18252b06914d6..0699c3ac471c55 100644 --- a/aten/src/THNN/generic/MultiLabelMarginCriterion.c +++ b/aten/src/THNN/generic/MultiLabelMarginCriterion.c @@ -17,14 +17,14 @@ void THNN_(MultiLabelMarginCriterion_updateOutput)( int64_t t, d, dt, ddt; real sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + dim = input->size(0); + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); } else @@ -155,16 +155,16 @@ void THNN_(MultiLabelMarginCriterion_updateGradInput)( int64_t t, d, dt; real g; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), "vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); - AT_CHECK((!target->is_empty() && target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == dim), + dim = input->size(0); + AT_CHECK((!target->is_empty() && target->dim() == 1) && (target->size(0) == dim), "inconsistent target size"); - AT_CHECK((!isTarget->is_empty() && isTarget->dim() <= 1) && (THTensor_sizeLegacyNoScalars(isTarget, 0) == dim), + AT_CHECK((!isTarget->is_empty() && isTarget->dim() == 1) && (isTarget->size(0) == dim), "inconsistent isTarget size"); } else diff --git a/aten/src/THNN/generic/MultiMarginCriterion.c b/aten/src/THNN/generic/MultiMarginCriterion.c index 2c8f38be23eb3a..424669e5de8515 100644 --- a/aten/src/THNN/generic/MultiMarginCriterion.c +++ b/aten/src/THNN/generic/MultiMarginCriterion.c @@ -20,19 +20,19 @@ void THNN_(MultiMarginCriterion_updateOutput)( int64_t t, d; real sum; - AT_CHECK(!input->is_empty() && input->dim() <= 2, + AT_CHECK(!input->is_empty() && (input->dim() == 1 || input->dim() == 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); + dim = input->size(0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (THTensor_nDimensionLegacyNoScalars(target) == 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } @@ -136,19 +136,19 @@ void THNN_(MultiMarginCriterion_updateGradInput)( int64_t t, d; real g; - AT_CHECK(!input->is_empty() && (input->dim() <= 2), + AT_CHECK(!input->is_empty() && 
(input->dim() == 1 || input->dim() == 2), "non-empty vector or matrix expected, got size: ", input->sizes()); - if (input->dim() <= 1) + if (input->dim() == 1) { nframe = 1; - dim = THTensor_sizeLegacyNoScalars(input, 0); + dim = input->size(0); } else { nframe = input->size(0); dim = input->size(1); - AT_CHECK(!target->is_empty() && (target->dim() <= 1) && (THTensor_sizeLegacyNoScalars(target, 0) == nframe), + AT_CHECK(!target->is_empty() && (target->dim() == 1) && (target->size(0) == nframe), "inconsistent target size, got: ", target->sizes()); } diff --git a/aten/src/THNN/generic/PReLU.c b/aten/src/THNN/generic/PReLU.c index 1837874852d2bb..e148fde783ce9d 100644 --- a/aten/src/THNN/generic/PReLU.c +++ b/aten/src/THNN/generic/PReLU.c @@ -26,8 +26,8 @@ void THNN_(PReLU_updateOutput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -91,8 +91,8 @@ void THNN_(PReLU_updateGradInput)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); @@ -162,8 +162,8 @@ void THNN_(PReLU_accGradParameters)( int64_t bs = 1, ks = 1; { int64_t input_ndim = THTensor_(nDimensionLegacyAll)(input); - if (THTensor_sizeLegacyNoScalars(input, input_ndim > 1) != nOutputPlane) - THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, THTensor_sizeLegacyNoScalars(input, input_ndim > 1)); + if (input->size(input_ndim > 1) != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size(input_ndim > 1)); if (input_ndim > 1) { bs = input->size(0); diff --git a/aten/src/THNN/generic/SparseLinear.c b/aten/src/THNN/generic/SparseLinear.c index 3bf8e652fa9ed9..a28d4e78477ceb 100644 --- a/aten/src/THNN/generic/SparseLinear.c +++ b/aten/src/THNN/generic/SparseLinear.c @@ -26,7 +26,7 @@ static bool THNN_(checkSize2D)(THTensor* t, int64_t size0, int64_t size1) static bool THNN_(checkSize1D)(THTensor* t, int64_t size0) { - return !t->is_empty() && THTensor_nDimensionLegacyNoScalars(t) == 1 && THTensor_sizeLegacyNoScalars(t, 0) == size0; + return !t->is_empty() && t->dim() == 1 && t->size(0) == size0; } static void THNN_(set1d)(THTensor *t, int64_t x0, real value) { diff --git a/aten/src/THNN/generic/SpatialConvolutionMM.c b/aten/src/THNN/generic/SpatialConvolutionMM.c index f18a6d0817059b..fce2c8575935a5 100644 --- a/aten/src/THNN/generic/SpatialConvolutionMM.c +++ b/aten/src/THNN/generic/SpatialConvolutionMM.c @@ -72,7 +72,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for(i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) + for(i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/SpatialDilatedConvolution.c b/aten/src/THNN/generic/SpatialDilatedConvolution.c index 2f71861963fcdf..63e7bd81033e12 100644 --- a/aten/src/THNN/generic/SpatialDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); diff --git a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c index eeb644fc9eb5e6..7226db67ef1a74 100644 --- a/aten/src/THNN/generic/SpatialFullDilatedConvolution.c +++ b/aten/src/THNN/generic/SpatialFullDilatedConvolution.c @@ -64,7 +64,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); @@ -332,7 +332,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); + nOutputPlane = THTensor_(size)(gradBias, 0); } else { return; } @@ -402,7 +402,7 @@ void THNN_(SpatialFullDilatedConvolution_accGradParameters)( // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) 
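/* Editorial note (not part of the patch): with the dimensions annotated on the
 * three lines below (n = nOutputPlane*kh*kw, m = nInputPlane,
 * k = inputHeight*inputWidth), the gemm performed at this point -- the call
 * itself is not shown in this hunk -- accumulates, in row-major terms,
 *
 *   gradWeight[i][j] += scale * sum_p input_n[i][p] * columns[j][p]
 *
 * i.e. gradWeight (m x n) += scale * input_n (m x k) * columns^T (k x n),
 * where columns holds the unfolded per-sample buffer. The transpose flags and
 * swapped m/n arguments mentioned in the "column-major" comment only
 * compensate for BLAS layout; this reading is an interpretation added for
 * clarity, not text from the original source. */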
int64_t n = columns->size(0); // nOutputPlane * kh * kw - int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0); // nInputPlane + int64_t m = input_n->size(0); // nInputPlane int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) diff --git a/aten/src/THNN/generic/SpatialGridSamplerBilinear.c b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c new file mode 100644 index 00000000000000..d31f3e0a76c20a --- /dev/null +++ b/aten/src/THNN/generic/SpatialGridSamplerBilinear.c @@ -0,0 +1,250 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(SpatialGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 4, 2, input, + "non-empty 4D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 4, 2, grid, + "non-empty 4D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int oheight = THTensor_(size)(grid, 1); + int owidth = THTensor_(size)(grid, 2); + + THNN_CHECK_DIM_SIZE(grid, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 4, 3, 2); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, owidth); + } +} + +#define SAFE_GET(input, x, y, n, c, H, W) x >= 0 && x < W && y >=0 \ + && y < H ? THTensor_(fastGet4d)(input, n, c, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + // resize output to the same shape as input + THTensor_(resize4d)(output, N, C, H, W); + + // loop over each output pixel + int n, h, w, c; +#pragma omp parallel for private(n, h, w, c) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + // normalize ix, iy from [-1, 1] to [0, IH-1] & [0, IW-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_nw, ix_nw, IW); + CLIP_COORDINATES(iy_nw, iy_nw, IH); + CLIP_COORDINATES(ix_ne, ix_ne, 
IW); + CLIP_COORDINATES(iy_ne, iy_ne, IH); + CLIP_COORDINATES(ix_sw, ix_sw, IW); + CLIP_COORDINATES(iy_sw, iy_sw, IH); + CLIP_COORDINATES(ix_se, ix_se, IW); + CLIP_COORDINATES(iy_se, iy_se, IH); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real nw_val = SAFE_GET(input, ix_nw, iy_nw, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne, iy_ne, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw, iy_sw, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se, iy_se, n, c, IH, IW); + real out_val = nw_val * nw + ne_val * ne + sw_val * sw + se_val * se; + THTensor_(fastSet4d)(output, n, c, h, w, out_val); + } + } + } + } +} + +#define SAFE_ADD(input, x, y, n, c, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H) { \ + real old_value = THTensor_(fastGet4d)(input, n, c, y, x); \ + THTensor_(fastSet4d)(input, n, c, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(SpatialGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int IH = THTensor_(size)(input, 2); + int IW = THTensor_(size)(input, 3); + int H = THTensor_(size)(grid, 1); + int W = THTensor_(size)(grid, 2); + + THTensor_(resize4d)(gradInput, N, C, IH, IW); + THTensor_(resize4d)(gradGrid, N, H, W, 2); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, h, w; +#pragma omp parallel for private(n, h, w) + for (n = 0; n < N; ++n) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y co-ordinates from grid + real ix = THTensor_(fastGet4d)(grid, n, h, w, 0); + real iy = THTensor_(fastGet4d)(grid, n, h, w, 1); + + real gix = 0; + real giy = 0; + + // normalize ix, iy from [-1, 1] to [0, H-1] & [0, W-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = floor(ix); + int iy_nw = floor(iy); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + real nw = (ix_se - ix) * (iy_se - iy); + real ne = (ix - ix_sw) * (iy_sw - iy); + real sw = (ix_ne - ix) * (iy - iy_ne); + real se = (ix - ix_nw) * (iy - iy_nw); + + int ix_nw_cl, iy_nw_cl, ix_ne_cl, iy_ne_cl, ix_sw_cl, iy_sw_cl, ix_se_cl, iy_se_cl; + + if (padding_mode==MODE_BORDER){ + // get clipped NE, NW, SE, SW pixel values from (x, y) + CLIP_COORDINATES(ix_nw, ix_nw_cl, IW); + CLIP_COORDINATES(iy_nw, iy_nw_cl, IH); + CLIP_COORDINATES(ix_ne, ix_ne_cl, IW); + CLIP_COORDINATES(iy_ne, iy_ne_cl, IH); + CLIP_COORDINATES(ix_sw, ix_sw_cl, IW); + CLIP_COORDINATES(iy_sw, iy_sw_cl, IH); + CLIP_COORDINATES(ix_se, ix_se_cl, IW); + CLIP_COORDINATES(iy_se, iy_se_cl, IH); + } + else { + ix_nw_cl = ix_nw; + iy_nw_cl = iy_nw; + ix_ne_cl = ix_ne; + iy_ne_cl = iy_ne; + ix_sw_cl = ix_sw; + iy_sw_cl = iy_sw; + ix_se_cl = ix_se; + iy_se_cl = iy_se; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet4d)(gradOutput, n, c, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_nw_cl, iy_nw_cl, n, c, IH, IW, nw * gradout); + 
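/* Descriptive note added for clarity (not part of the patch): the four
 * SAFE_ADD calls scatter w_corner * gradout back into gradInput at the
 * (possibly clipped) corner locations, which is the adjoint of the weighted
 * gather performed in updateOutput. The gix/giy accumulations that follow
 * apply the chain rule through the bilinear weights: for example
 * d(nw)/d(ix) = -(iy_se - iy), so the nw term contributes
 * -nw_val * (iy_se - iy) * gradout to gix. The final scaling by (IW-1)/2 and
 * (IH-1)/2 undoes the [-1, 1] -> pixel-space normalization. */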
SAFE_ADD(gradInput, ix_ne_cl, iy_ne_cl, n, c, IH, IW, ne * gradout); + SAFE_ADD(gradInput, ix_sw_cl, iy_sw_cl, n, c, IH, IW, sw * gradout); + SAFE_ADD(gradInput, ix_se_cl, iy_se_cl, n, c, IH, IW, se * gradout); + + // calculate gradGrid + real nw_val = SAFE_GET(input, ix_nw_cl, iy_nw_cl, n, c, IH, IW); + real ne_val = SAFE_GET(input, ix_ne_cl, iy_ne_cl, n, c, IH, IW); + real sw_val = SAFE_GET(input, ix_sw_cl, iy_sw_cl, n, c, IH, IW); + real se_val = SAFE_GET(input, ix_se_cl, iy_se_cl, n, c, IH, IW); + + gix -= nw_val * (iy_se - iy) * gradout; + gix += ne_val * (iy_sw - iy) * gradout; + gix -= sw_val * (iy - iy_ne) * gradout; + gix += se_val * (iy - iy_nw) * gradout; + + giy -= nw_val * (ix_se - ix) * gradout; + giy -= ne_val * (ix - ix_sw) * gradout; + giy += sw_val * (ix_ne - ix) * gradout; + giy += se_val * (ix - ix_nw) * gradout; + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + + real gix_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 0); + real giy_old = THTensor_(fastGet4d)(gradGrid, n, h, w, 1); + + THTensor_(fastSet4d)(gradGrid, n, h, w, 0, gix_old + gix); + THTensor_(fastSet4d)(gradGrid, n, h, w, 1, giy_old + giy); + } + } + } +} + + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/generic/THNN.h b/aten/src/THNN/generic/THNN.h index 1d7a9176553756..455da04c7e4454 100644 --- a/aten/src/THNN/generic/THNN.h +++ b/aten/src/THNN/generic/THNN.h @@ -90,8 +90,7 @@ TH_API void THNN_(ELU_updateOutput)( THTensor *input, // input tensor THTensor *output, // [OUT] ELU output accreal alpha, // an ELU parameter (as in paper) - accreal scale, // scaling factor for output - accreal input_scale, // scaling factor for input + accreal scale, // scaling factor bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) TH_API void THNN_(ELU_updateGradInput)( THNNState *state, // library's state @@ -99,8 +98,7 @@ TH_API void THNN_(ELU_updateGradInput)( THTensor *gradInput, // [OUT] gradient w.r.t. 
input THTensor *output, // output from a forward pass accreal alpha, // an ELU parameter (as in paper) - accreal scale, - accreal input_scale); + accreal scale); TH_API void THNN_(DistKLDivCriterion_updateOutput)( THNNState *state, // library's state @@ -1229,6 +1227,34 @@ TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( int osizeW, bool align_corners); +TH_API void THNN_(SpatialGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(SpatialGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode); + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode); + TH_API void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, diff --git a/aten/src/THNN/generic/TemporalRowConvolution.c b/aten/src/THNN/generic/TemporalRowConvolution.c index e7b51ec194c402..b623e5a2ad7fd4 100644 --- a/aten/src/THNN/generic/TemporalRowConvolution.c +++ b/aten/src/THNN/generic/TemporalRowConvolution.c @@ -38,7 +38,7 @@ static inline void THNN_(TemporalRowConvolution_shapeCheck)( THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 1, input, "non-empty 2D or 3D (batch mode) input tensor expected, but got :%s"); - int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); + int64_t inputFrameSize = weight->size(0); int64_t nInputFrame = input->size(dimS); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -197,7 +197,7 @@ void THNN_(TemporalRowConvolution_updateOutput)( THNN_(TemporalRowConvolution_shapeCheck)( state, input, NULL, weight, bias, kW, dW, padW); - int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); + int64_t inputFrameSize = weight->size(0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -311,7 +311,7 @@ void THNN_(TemporalRowConvolution_updateGradInput)( THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, NULL, kW, dW, padW); - int64_t inputFrameSize = THTensor_sizeLegacyNoScalars(weight, 0); + int64_t inputFrameSize = weight->size(0); int64_t nInputFrame = input->size(ndim - 1); int64_t nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; @@ -386,7 +386,7 @@ static void THNN_(TemporalRowConvolution_accGradParameters_frame)( THTensor_(free)(tfinput); if (gradBias != NULL) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) { + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; real *data = THStorage_(data)(THTensor_getStoragePtr(gradOutput3d)) diff --git a/aten/src/THNN/generic/VolumetricConvolution.c b/aten/src/THNN/generic/VolumetricConvolution.c index c979edf71f8f4c..4b74445e047705 100644 --- a/aten/src/THNN/generic/VolumetricConvolution.c +++ b/aten/src/THNN/generic/VolumetricConvolution.c @@ -51,7 +51,7 @@ void THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, output, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -78,7 +78,7 @@ void 
THNN_(VolumetricConvolution_updateOutput)( /* add bias */ if (bias) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(bias, 0); i++) + for (i = 0; i < bias->size(0); i++) { THTensor_(select)(outn, outb, 0, i); THTensor_(fill)(outn, THTensor_(get1d)(bias, i)); @@ -117,7 +117,7 @@ void THNN_(VolumetricConvolution_updateGradInput)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for weight, but got: %s"); - int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(weight, 0); + int nOutputPlane = (int)weight->size(0); THNN_ARGCHECK(!gradOutput->is_empty() && (gradOutput->dim() == 4 || gradOutput->dim() == 5), 3, gradOutput, @@ -187,9 +187,9 @@ void THNN_(VolumetricConvolution_accGradParameters)( "non-empty 5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " "expected for gradWeight, but got: %s"); - int nOutputPlane = (int)THTensor_sizeLegacyNoScalars(gradWeight, 0); + int nOutputPlane = (int)gradWeight->size(0); if (gradBias) { - THArgCheck(!gradBias->is_empty() && THTensor_nDimensionLegacyNoScalars(gradBias) == 1 && THTensor_sizeLegacyNoScalars(gradBias, 0) == nOutputPlane, 5, + THArgCheck(!gradBias->is_empty() && gradBias->dim() == 1 && gradBias->size(0) == nOutputPlane, 5, "gradBias tensor has wrong size" ); } diff --git a/aten/src/THNN/generic/VolumetricConvolutionMM.c b/aten/src/THNN/generic/VolumetricConvolutionMM.c index 209d1575dacbec..14d98a79dd29b8 100644 --- a/aten/src/THNN/generic/VolumetricConvolutionMM.c +++ b/aten/src/THNN/generic/VolumetricConvolutionMM.c @@ -102,7 +102,7 @@ static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); @@ -691,7 +691,7 @@ static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( } if (gradBias) { - for (i = 0; i < THTensor_sizeLegacyNoScalars(gradBias, 0); i++) + for (i = 0; i < gradBias->size(0); i++) { int64_t k; real sum = 0; diff --git a/aten/src/THNN/generic/VolumetricDilatedConvolution.c b/aten/src/THNN/generic/VolumetricDilatedConvolution.c index c9fa19f0adf488..8222c534612fd5 100644 --- a/aten/src/THNN/generic/VolumetricDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricDilatedConvolution.c @@ -69,7 +69,7 @@ static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( int64_t nOutputPlane = weight->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + int64_t nOutputPlane = bias->size(0); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); diff --git a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c index 16dedeffb9c58f..4cc4dcc69837d8 100644 --- a/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c +++ b/aten/src/THNN/generic/VolumetricFullDilatedConvolution.c @@ -154,7 +154,7 @@ static inline void THNN_(VolumetricFullDilatedConvolution_shapeCheck)( const int64_t nOutputPlane = weight->size(1); THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } else if (bias != NULL) { - const int64_t nOutputPlane = THTensor_sizeLegacyNoScalars(bias, 0); + const int64_t nOutputPlane = bias->size(0); 
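/* Descriptive note added for clarity (not part of the patch): this hunk is the
 * same substitution applied throughout the diff, replacing
 * THTensor_sizeLegacyNoScalars(bias, 0) with bias->size(0). If I recall the
 * legacy helper correctly, it treated a 0-dimensional tensor as a one-element
 * vector (reporting size 1), whereas size(0) requires at least one dimension,
 * so the new call sites assume bias and the other arguments touched here are
 * never zero-dimensional. */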
THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); } THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); @@ -441,7 +441,7 @@ void THNN_(VolumetricFullDilatedConvolution_accGradParameters)( if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { - nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); + nOutputPlane = THTensor_(size)(gradBias, 0); } else { return; } diff --git a/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c new file mode 100644 index 00000000000000..4d7ace422d4e97 --- /dev/null +++ b/aten/src/THNN/generic/VolumetricGridSamplerBilinear.c @@ -0,0 +1,409 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricGridSamplerBilinear.c" +#else + +#undef MIN +#define MIN(a,b) ( ((a)<(b)) ? (a) : (b) ) +#undef MAX +#define MAX(a,b) ( ((a)>(b)) ? (a) : (b) ) + +#undef MODE_BORDER +#define MODE_BORDER 1 + +static inline void THNN_(VolumetricGridSamplerBilinear_shapeCheck) + (THTensor *input, THTensor *grid, THTensor *gradOutput) { + THNN_ARGCHECK(!input->is_empty() && input->dim() == 5, 2, input, + "non-empty 5D input tensor expected but got: %s"); + THNN_ARGCHECK(!grid->is_empty() && grid->dim() == 5, 2, grid, + "non-empty 5D grid tensor expected but got: %s"); + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int odepth = THTensor_(size)(grid, 1); + int oheight = THTensor_(size)(grid, 2); + int owidth = THTensor_(size)(grid, 3); + + THNN_CHECK_DIM_SIZE(grid, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(grid, 5, 4, 3); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nbatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, channels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, odepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, owidth); + } +} + +#define SAFE_GET(input, x, y, z, n, c, D, H, W) \ + x >= 0 && x < W && y >=0 && y < H && z >= 0 && z < D \ + ? 
THTensor_(fastGet5d)(input, n, c, z, y, x) : 0 + +#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit-1), MAX(in, 0)) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *grid, + THTensor *output, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, NULL); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + // resize output to the same shape as input + THTensor_(resize5d)(output, N, C, D, H, W); + + // loop over each output pixel + int n, d, h, w, c; +#pragma omp parallel for private(n, d, h, w, c) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + // normalize ix, iy, iz from [-1, 1] to [0, IW-1] & [0, IH-1] & [0, ID-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw, ID); + CLIP_COORDINATES(ix_tne, ix_tne, IW); + CLIP_COORDINATES(iy_tne, iy_tne, IH); + CLIP_COORDINATES(iz_tne, iz_tne, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw, ID); + CLIP_COORDINATES(ix_tse, ix_tse, IW); + CLIP_COORDINATES(iy_tse, iy_tse, IH); + CLIP_COORDINATES(iz_tse, iz_tse, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw, ID); + CLIP_COORDINATES(ix_bne, ix_bne, IW); + CLIP_COORDINATES(iy_bne, iy_bne, IH); + CLIP_COORDINATES(iz_bne, iz_bne, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw, 
ID); + CLIP_COORDINATES(ix_bse, ix_bse, IW); + CLIP_COORDINATES(iy_bse, iy_bse, IH); + CLIP_COORDINATES(iz_bse, iz_bse, ID); + } + + // calculate bilinear weighted pixel value and set output pixel + for (c = 0; c < C; ++c) { + // (c, iy_nw, ix_nw) * nw + (c, iy_ne, ix_ne) * ne + // + (c, iy_sw, ix_sw) * sw + (c, iy_se, ix_se) * se + real tnw_val = SAFE_GET(input, ix_tnw, iy_tnw, iz_tnw, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne, iy_tne, iz_tne, n, c, ID, IH, IW); + real tsw_val = SAFE_GET(input, ix_tsw, iy_tsw, iz_tsw, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse, iy_tse, iz_tse, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw, iy_bnw, iz_bnw, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne, iy_bne, iz_bne, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw, iy_bsw, iz_bsw, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse, iy_bse, iz_bse, n, c, ID, IH, IW); + real out_val = tnw_val * tnw + tne_val * tne + tsw_val * tsw + tse_val * tse + + bnw_val * bnw + bne_val * bne + bsw_val * bsw + bse_val * bse; + THTensor_(fastSet5d)(output, n, c, d, h, w, out_val); + } + } + } + } + } +} + +#define SAFE_ADD(input, x, y, z, n, c, D, H, W, value) \ + do { \ + if (x >= 0 && x < W && y >=0 && y < H && z >=0 && z < D) { \ + real old_value = THTensor_(fastGet5d)(input, n, c, z, y, x); \ + THTensor_(fastSet5d)(input, n, c, z, y, x, value + old_value); \ + } \ + } while(0) + +TH_API void THNN_(VolumetricGridSamplerBilinear_updateGradInput)( + THNNState *state, + THTensor *input, THTensor *gradInput, + THTensor *grid, THTensor *gradGrid, + THTensor *gradOutput, + int padding_mode) { + + THNN_(VolumetricGridSamplerBilinear_shapeCheck)(input, grid, gradOutput); + int N = THTensor_(size)(input, 0); + int C = THTensor_(size)(input, 1); + int ID = THTensor_(size)(input, 2); + int IH = THTensor_(size)(input, 3); + int IW = THTensor_(size)(input, 4); + int D = THTensor_(size)(grid, 1); + int H = THTensor_(size)(grid, 2); + int W = THTensor_(size)(grid, 3); + + THTensor_(resize5d)(gradInput, N, C, ID, IH, IW); + THTensor_(resize5d)(gradGrid, N, D, H, W, 3); + THTensor_(zero)(gradInput); + THTensor_(zero)(gradGrid); + + // loop over each output pixel + int n, d, h, w; +//#pragma omp parallel for private(n, d, h, w) + for (n = 0; n < N; ++n) { + for (d = 0; d < D; ++d) { + for (h = 0; h < H; ++h) { + for (w = 0; w < W; ++w) { + // get the corresponding input x, y, z co-ordinates from grid + real ix = THTensor_(fastGet5d)(grid, n, d, h, w, 0); + real iy = THTensor_(fastGet5d)(grid, n, d, h, w, 1); + real iz = THTensor_(fastGet5d)(grid, n, d, h, w, 2); + + real gix = 0; + real giy = 0; + real giz = 0; + + // normalize ix, iy, iz from [-1, 1] to [0, W-1] & [0, H-1] & [0, D-1] + ix = ((ix + 1) / 2) * (IW-1); + iy = ((iy + 1) / 2) * (IH-1); + iz = ((iz + 1) / 2) * (ID-1); + + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = floor(ix); + int iy_tnw = floor(iy); + int iz_tnw = floor(iz); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int 
iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + real tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + real tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + real tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + real tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + real bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + real bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + real bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + real bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + int ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, ix_tne_cl, iy_tne_cl, iz_tne_cl; + int ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, ix_tse_cl, iy_tse_cl, iz_tse_cl; + int ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, ix_bne_cl, iy_bne_cl, iz_bne_cl; + int ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, ix_bse_cl, iy_bse_cl, iz_bse_cl; + + if (padding_mode==MODE_BORDER){ + // clip coordinates to image borders + CLIP_COORDINATES(ix_tnw, ix_tnw_cl, IW); + CLIP_COORDINATES(iy_tnw, iy_tnw_cl, IH); + CLIP_COORDINATES(iz_tnw, iz_tnw_cl, ID); + CLIP_COORDINATES(ix_tne, ix_tne_cl, IW); + CLIP_COORDINATES(iy_tne, iy_tne_cl, IH); + CLIP_COORDINATES(iz_tne, iz_tne_cl, ID); + CLIP_COORDINATES(ix_tsw, ix_tsw_cl, IW); + CLIP_COORDINATES(iy_tsw, iy_tsw_cl, IH); + CLIP_COORDINATES(iz_tsw, iz_tsw_cl, ID); + CLIP_COORDINATES(ix_tse, ix_tse_cl, IW); + CLIP_COORDINATES(iy_tse, iy_tse_cl, IH); + CLIP_COORDINATES(iz_tse, iz_tse_cl, ID); + CLIP_COORDINATES(ix_bnw, ix_bnw_cl, IW); + CLIP_COORDINATES(iy_bnw, iy_bnw_cl, IH); + CLIP_COORDINATES(iz_bnw, iz_bnw_cl, ID); + CLIP_COORDINATES(ix_bne, ix_bne_cl, IW); + CLIP_COORDINATES(iy_bne, iy_bne_cl, IH); + CLIP_COORDINATES(iz_bne, iz_bne_cl, ID); + CLIP_COORDINATES(ix_bsw, ix_bsw_cl, IW); + CLIP_COORDINATES(iy_bsw, iy_bsw_cl, IH); + CLIP_COORDINATES(iz_bsw, iz_bsw_cl, ID); + CLIP_COORDINATES(ix_bse, ix_bse_cl, IW); + CLIP_COORDINATES(iy_bse, iy_bse_cl, IH); + CLIP_COORDINATES(iz_bse, iz_bse_cl, ID); + } + else { + ix_tnw_cl = ix_tnw; + iy_tnw_cl = iy_tnw; + iz_tnw_cl = iz_tnw; + ix_tne_cl = ix_tne; + iy_tne_cl = iy_tne; + iz_tne_cl = iz_tne; + ix_tsw_cl = ix_tsw; + iy_tsw_cl = iy_tsw; + iz_tsw_cl = iz_tsw; + ix_tse_cl = ix_tse; + iy_tse_cl = iy_tse; + iz_tse_cl = iz_tse; + ix_bnw_cl = ix_bnw; + iy_bnw_cl = iy_bnw; + iz_bnw_cl = iz_bnw; + ix_bne_cl = ix_bne; + iy_bne_cl = iy_bne; + iz_bne_cl = iz_bne; + ix_bsw_cl = ix_bsw; + iy_bsw_cl = iy_bsw; + iz_bsw_cl = iz_bsw; + ix_bse_cl = ix_bse; + iy_bse_cl = iy_bse; + iz_bse_cl = iz_bse; + } + + for (int c = 0; c < C; ++c) { + real gradout = THTensor_(fastGet5d)(gradOutput, n, c, d, h, w); + + // calculate and set gradInput + SAFE_ADD(gradInput, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW, tnw * gradout); + SAFE_ADD(gradInput, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW, tne * gradout); + SAFE_ADD(gradInput, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW, tsw * gradout); + SAFE_ADD(gradInput, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW, tse * gradout); + SAFE_ADD(gradInput, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW, bnw * gradout); + SAFE_ADD(gradInput, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW, bne * gradout); + SAFE_ADD(gradInput, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW, bsw * gradout); + SAFE_ADD(gradInput, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW, bse * gradout); + + // calculate gradGrid + real tnw_val = SAFE_GET(input, ix_tnw_cl, iy_tnw_cl, iz_tnw_cl, n, c, ID, IH, IW); + real tne_val = SAFE_GET(input, ix_tne_cl, iy_tne_cl, iz_tne_cl, n, c, ID, IH, IW); + 
real tsw_val = SAFE_GET(input, ix_tsw_cl, iy_tsw_cl, iz_tsw_cl, n, c, ID, IH, IW); + real tse_val = SAFE_GET(input, ix_tse_cl, iy_tse_cl, iz_tse_cl, n, c, ID, IH, IW); + real bnw_val = SAFE_GET(input, ix_bnw_cl, iy_bnw_cl, iz_bnw_cl, n, c, ID, IH, IW); + real bne_val = SAFE_GET(input, ix_bne_cl, iy_bne_cl, iz_bne_cl, n, c, ID, IH, IW); + real bsw_val = SAFE_GET(input, ix_bsw_cl, iy_bsw_cl, iz_bsw_cl, n, c, ID, IH, IW); + real bse_val = SAFE_GET(input, ix_bse_cl, iy_bse_cl, iz_bse_cl, n, c, ID, IH, IW); + + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gradout; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gradout; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gradout; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gradout; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gradout; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gradout; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gradout; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gradout; + + + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gradout; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gradout; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gradout; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gradout; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gradout; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gradout; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gradout; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gradout; + + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gradout; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gradout; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gradout; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gradout; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gradout; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gradout; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gradout; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gradout; + + } + + // un-normalize gradGrid values back to [-1, 1] constraints + gix = gix * (IW - 1) / 2; + giy = giy * (IH - 1) / 2; + giz = giz * (ID - 1) / 2; + + real gix_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 0); + real giy_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 1); + real giz_old = THTensor_(fastGet5d)(gradGrid, n, d, h, w, 2); + + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 0, gix_old + gix); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 1, giy_old + giy); + THTensor_(fastSet5d)(gradGrid, n, d, h, w, 2, giz_old + giz); + } + } + } + } +} + +#undef MIN +#undef MAX +#undef SAFE_GET +#undef CLIP_COORDINATES +#undef SAFE_ADD +#undef MODE_BORDER + +#endif diff --git a/aten/src/THNN/init.cpp b/aten/src/THNN/init.cpp index c77cd76d54ec87..6c79f5be295b60 100644 --- a/aten/src/THNN/init.cpp +++ b/aten/src/THNN/init.cpp @@ -45,7 +45,7 @@ #define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ + THTensor_(size)(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -53,7 +53,7 @@ #define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \ if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \ - THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \ + THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \ THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \ THError("Need " #T " of dimension %d and " #T ".size[%d] 
== %d" \ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ @@ -245,6 +245,9 @@ #include "generic/SpatialUpSamplingBilinear.c" #include "THGenerateFloatTypes.h" +#include "generic/SpatialGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" + #include "generic/VolumetricAveragePooling.c" #include "THGenerateFloatTypes.h" @@ -301,3 +304,6 @@ #include "generic/VolumetricUpSamplingTrilinear.c" #include "THGenerateFloatTypes.h" + +#include "generic/VolumetricGridSamplerBilinear.c" +#include "THGenerateFloatTypes.h" diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 588dae10e8e8e3..0d84ccbfb606a1 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -40,7 +40,6 @@ if(BUILD_ATEN) # ATen tests use catch instead of gtest so keep separate for now # list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS}) # list(APPEND Caffe2_GPU_TEST_SRCS ${ATen_CUDA_TEST_SRCS}) - list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) @@ -52,15 +51,6 @@ if(BUILD_ATEN) set(Caffe2_HIP_SRCS ${ATen_CUDA_SRCS}) set(Caffe2_HIP_INCLUDES ${Caffe2_HIP_INCLUDES} ${Caffe2_GPU_INCLUDE}) ENDIF(USE_ROCM) -else() - # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen. - # This codepath should only be exercised by the Android build. - add_subdirectory(../aten/src/ATen/core ATen_core) - list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS}) - list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE}) - list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS}) - # TODO: We should probably install the headers, but I don't know - # how to do that. endif() # ---[ Torch build @@ -225,72 +215,6 @@ target_include_directories(caffe2 SYSTEM PRIVATE "${Caffe2_DEPENDENCY_INCLUDE}") aten_set_target_props(caffe2) target_compile_options(caffe2 INTERFACE "-std=c++11") target_compile_options(caffe2 PRIVATE "-DCAFFE2_BUILD_MAIN_LIB") -if (MSVC AND NOT BUILD_SHARED_LIBS) - # Note [Supporting both static and dynamic libraries on Window] - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # A Windows library may be distributed as either a static or dynamic - # library. The chosen distribution mechanism affects how you setup - # the headers for the library: if you statically link a function, - # all you need is an ordinary signature: - # - # void f(); - # - # But if you *dynamically* link it, then you must provide a __declspec - # specifying that it should be imported from a DLL: - # - # __declspec(dllimport) void f(); - # - # Mixing the two situations will not work: if you specify dllimport - # while statically linking, the linker will complain it cannot find - # the __imp_f symbol (which serve as the DLL entrypoint); if you - # fail to specify dllimport for a symbol that's coming from a DLL, - # the linker will complain that it can't find f. Joy! - # - # Most places on the Internet, you will find people have written - # their headers under the assumption that the application will - # only ever be dynamically linked, as they define a macro which - # tags a function as __declspec(dllexport) if you are actually - # building the library, and __declspec(dllimport) otherwise. But - # if you want these headers to also work if you are linking against - # a static library, you need a way to avoid adding these __declspec's - # at all. 
And that "mechanism" needs to apply to any downstream - # libraries/executables which are going to link against your library. - # - # As an aside, why do we need to support both modes? - # For historical reasons, PyTorch ATen on Windows is built dynamically, - # while Caffe2 on Windows is built statically (mostly because if - # we build it dynamically, we are over the DLL exported symbol limit--and - # that is because Caffe2 hasn't comprehensively annotated all symbols - # which cross the DLL boundary with CAFFE_API). So any code - # which is used by both PyTorch and Caffe2 needs to support both - # modes of linking. - # - # So, you have a macro (call it AT_CORE_STATIC_WINDOWS) which you need to have - # set for any downstream library/executable that transitively includes your - # headers. How are you going to do this? You have two options: - # - # 1. Write out a config.h header which stores whether or not - # you are linking statically or dynamically. - # - # 2. Force all of users to set the the macro themselves. If they - # use cmake, you can set -DAT_CORE_STATIC_WINDOWS=1 as a PUBLIC - # compile option, in which case cmake will automatically - # add the macro for you. - # - # Which one is better? Well, it depends: they trade off implementor - # ease versus user ease: (1) is more work for the library author - # but the user doesn't have to worry about it; (2) requires the user - # to set the macro themselves... but only if they don't use cmake. - # - # So, which is appropriate in our situation? In my mind, here is - # the distinguishing factor: it is more common to distribute - # DLLs, since they don't require you to line up the CRT version - # (/MD, /MDd, /MT, /MTd) and MSVC version at the use site. So, - # if a user is already in the business of static linkage, they're - # already in "expert user" realm. So, I've decided that at this - # point in time, the simplicity of implementation of (2) wins out. 
- target_compile_options(caffe2 PUBLIC "-DAT_CORE_STATIC_WINDOWS=1") -endif() # Use -O2 for release builds (-O3 doesn't improve perf, and -Os results in perf regression) target_compile_options(caffe2 PRIVATE "$<$,$>:-O2>") install(TARGETS caffe2 EXPORT Caffe2Targets DESTINATION lib) diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index df3ee5326b7d90..bc93f4866ebc28 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -10,6 +10,7 @@ at::Backend ATenOp::backend() const { } OPERATOR_SCHEMA(ATen); +CAFFE_KNOWN_TYPE(at::Half); namespace math { template <> diff --git a/caffe2/core/context.h b/caffe2/core/context.h index fc3969879f30c4..f2831909e1587a 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -13,9 +13,6 @@ #include "caffe2/core/typeid.h" #include "caffe2/proto/caffe2.pb.h" -#include "ATen/core/ATenCoreTest.h" -#include "ATen/core/ArrayRef.h" - CAFFE2_DECLARE_bool(caffe2_report_cpu_memory_usage); namespace caffe2 { diff --git a/caffe2/core/context_test.cc b/caffe2/core/context_test.cc index 8924a9dc931be9..a6e44846e9e0be 100644 --- a/caffe2/core/context_test.cc +++ b/caffe2/core/context_test.cc @@ -6,11 +6,6 @@ namespace caffe2 { -TEST(CPUContextTest, ATenCoreTest) { - int i = at::CoreTest(); - EXPECT_EQ(i + 1, at::CoreTest()); -} - TEST(CPUContextTest, TestAllocAlignment) { for (int i = 1; i < 10; ++i) { auto data = CPUContext::New(i); diff --git a/caffe2/core/dispatch/DeviceId.h b/caffe2/core/dispatch/DeviceId.h index e5744ce1e1c2d6..e74a803557ea0d 100644 --- a/caffe2/core/dispatch/DeviceId.h +++ b/caffe2/core/dispatch/DeviceId.h @@ -1,8 +1,8 @@ #pragma once -#include #include #include +#include "caffe2/utils/C++17.h" namespace c10 { diff --git a/caffe2/core/dispatch/LayoutId.h b/caffe2/core/dispatch/LayoutId.h index 9ec44519b95a99..7f039fadfa9698 100644 --- a/caffe2/core/dispatch/LayoutId.h +++ b/caffe2/core/dispatch/LayoutId.h @@ -1,10 +1,10 @@ #pragma once -#include "ATen/core/IdWrapper.h" +#include "caffe2/utils/IdWrapper.h" namespace c10 { -class LayoutId final : public at::IdWrapper { +class LayoutId final : public c10::guts::IdWrapper { public: constexpr explicit LayoutId(underlying_type id): IdWrapper(id) {} @@ -19,4 +19,4 @@ class LayoutId final : public at::IdWrapper { } -AT_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) +C10_DEFINE_HASH_FOR_IDWRAPPER(c10::LayoutId) diff --git a/caffe2/core/dispatch/TensorTypeId.h b/caffe2/core/dispatch/TensorTypeId.h index 244817904667b9..a80fc8377c8ca5 100644 --- a/caffe2/core/dispatch/TensorTypeId.h +++ b/caffe2/core/dispatch/TensorTypeId.h @@ -1,6 +1,6 @@ #pragma once -#include "ATen/core/IdWrapper.h" +#include "caffe2/utils/IdWrapper.h" #include #include #include @@ -21,7 +21,7 @@ namespace details { /** * Dynamic type ID of a Tensor argument. It represents something like CPUTensor, etc. */ -class TensorTypeId final : public at::IdWrapper { +class TensorTypeId final : public guts::IdWrapper { public: // Don't use this! 
// Unfortunately, a default constructor needs to be defined because of https://reviews.llvm.org/D41223 @@ -35,4 +35,4 @@ class TensorTypeId final : public at::IdWrapper +#include "caffe2/utils/C++17.h" namespace c10 { diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h index 70490856b5ecaf..1e8156abe42172 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpClasses.h @@ -659,3 +659,336 @@ class NHWC2NCHW : public NeuralNetOperator { private: }; + +class Int8Quantize : public NeuralNetOperator { + public: + Int8Quantize() : NeuralNetOperator(NNKind::Int8Quantize) {} + + ~Int8Quantize() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Quantize); + + private: +}; + +class Int8Dequantize : public NeuralNetOperator { + public: + Int8Dequantize() : NeuralNetOperator(NNKind::Int8Dequantize) {} + + ~Int8Dequantize() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Dequantize); + + private: +}; + +class Int8AveragePool : public NeuralNetOperator { + public: + Int8AveragePool() : NeuralNetOperator(NNKind::Int8AveragePool) {} + + Int8AveragePool(const AveragePool& averagePool) + : NeuralNetOperator(NNKind::Int8AveragePool) {} + + ~Int8AveragePool() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePool); + + private: +}; + +class Int8Conv : public NeuralNetOperator { + public: + Int8Conv() : NeuralNetOperator(NNKind::Int8Conv) {} + + Int8Conv(const Conv& conv) : NeuralNetOperator(NNKind::Int8Conv) {} + + ~Int8Conv() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Conv); + + private: +}; + +class Int8ConvTranspose : public NeuralNetOperator { + public: + Int8ConvTranspose() : NeuralNetOperator(NNKind::Int8ConvTranspose) {} + + Int8ConvTranspose(const ConvTranspose& convTranspose) + : NeuralNetOperator(NNKind::Int8ConvTranspose) {} + + ~Int8ConvTranspose() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvTranspose); + + private: +}; + +class Int8FC : public NeuralNetOperator { + public: + Int8FC() : NeuralNetOperator(NNKind::Int8FC) {} + + Int8FC(const FC& fC) : NeuralNetOperator(NNKind::Int8FC) {} + + ~Int8FC() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8FC); + + private: +}; + +class Int8MaxPool : public NeuralNetOperator { + public: + Int8MaxPool() : NeuralNetOperator(NNKind::Int8MaxPool) {} + + Int8MaxPool(const MaxPool& maxPool) + : NeuralNetOperator(NNKind::Int8MaxPool) {} + + ~Int8MaxPool() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPool); + + private: +}; + +class Int8Relu : public NeuralNetOperator { + public: + Int8Relu() : NeuralNetOperator(NNKind::Int8Relu) {} + + Int8Relu(const Relu& relu) : NeuralNetOperator(NNKind::Int8Relu) {} + + ~Int8Relu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Relu); + + private: +}; + +class Int8GivenTensorFill : public NeuralNetOperator { + public: + Int8GivenTensorFill() : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} + + Int8GivenTensorFill(const GivenTensorFill& givenTensorFill) + : NeuralNetOperator(NNKind::Int8GivenTensorFill) {} + + ~Int8GivenTensorFill() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8GivenTensorFill); + + private: +}; + +class Int8Concat : public NeuralNetOperator { + public: + Int8Concat() : NeuralNetOperator(NNKind::Int8Concat) {} + + Int8Concat(const Concat& concat) : NeuralNetOperator(NNKind::Int8Concat) {} + + ~Int8Concat() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Concat); + + private: +}; + +class Int8Softmax : public NeuralNetOperator { + public: + Int8Softmax() : NeuralNetOperator(NNKind::Int8Softmax) {} + + Int8Softmax(const 
Softmax& softmax) + : NeuralNetOperator(NNKind::Int8Softmax) {} + + ~Int8Softmax() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Softmax); + + private: +}; + +class Int8ChannelShuffle : public NeuralNetOperator { + public: + Int8ChannelShuffle() : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} + + Int8ChannelShuffle(const ChannelShuffle& channelShuffle) + : NeuralNetOperator(NNKind::Int8ChannelShuffle) {} + + ~Int8ChannelShuffle() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8ChannelShuffle); + + private: +}; + +class Int8Sum : public NeuralNetOperator { + public: + Int8Sum() : NeuralNetOperator(NNKind::Int8Sum) {} + + Int8Sum(const Sum& sum) : NeuralNetOperator(NNKind::Int8Sum) {} + + ~Int8Sum() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Sum); + + private: +}; + +class Int8Add : public NeuralNetOperator { + public: + Int8Add() : NeuralNetOperator(NNKind::Int8Add) {} + + Int8Add(const Add& add) : NeuralNetOperator(NNKind::Int8Add) {} + + ~Int8Add() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Add); + + private: +}; + +class Int8Reshape : public NeuralNetOperator { + public: + Int8Reshape() : NeuralNetOperator(NNKind::Int8Reshape) {} + + Int8Reshape(const Reshape& reshape) + : NeuralNetOperator(NNKind::Int8Reshape) {} + + ~Int8Reshape() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Reshape); + + private: +}; + +class Int8Flatten : public NeuralNetOperator { + public: + Int8Flatten() : NeuralNetOperator(NNKind::Int8Flatten) {} + + Int8Flatten(const Flatten& flatten) + : NeuralNetOperator(NNKind::Int8Flatten) {} + + ~Int8Flatten() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8Flatten); + + private: +}; + +class Int8ConvRelu : public NeuralNetOperator { + public: + Int8ConvRelu() : NeuralNetOperator(NNKind::Int8ConvRelu) {} + + Int8ConvRelu(const ConvRelu& convRelu) + : NeuralNetOperator(NNKind::Int8ConvRelu) {} + + ~Int8ConvRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8ConvRelu); + + private: +}; + +class Int8SumRelu : public NeuralNetOperator { + public: + Int8SumRelu() : NeuralNetOperator(NNKind::Int8SumRelu) {} + + Int8SumRelu(const SumRelu& sumRelu) + : NeuralNetOperator(NNKind::Int8SumRelu) {} + + ~Int8SumRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8SumRelu); + + private: +}; + +class Int8AveragePoolRelu : public NeuralNetOperator { + public: + Int8AveragePoolRelu() : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} + + Int8AveragePoolRelu(const AveragePoolRelu& averagePoolRelu) + : NeuralNetOperator(NNKind::Int8AveragePoolRelu) {} + + ~Int8AveragePoolRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8AveragePoolRelu); + + private: +}; + +class Int8MaxPoolRelu : public NeuralNetOperator { + public: + Int8MaxPoolRelu() : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} + + Int8MaxPoolRelu(const MaxPoolRelu& maxPoolRelu) + : NeuralNetOperator(NNKind::Int8MaxPoolRelu) {} + + ~Int8MaxPoolRelu() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(Int8MaxPoolRelu); + + private: +}; + +class BatchMatMul : public NeuralNetOperator { + public: + BatchMatMul(bool transA = false, bool transB = true, bool broadcast = false) + : NeuralNetOperator(NNKind::BatchMatMul), + TransA(transA), + TransB(transB), + Broadcast(broadcast) {} + + ~BatchMatMul() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(BatchMatMul); + + bool getTransA() const { + return TransA; + } + + bool getTransB() const { + return TransB; + } + + bool getBroadcast() const { + return Broadcast; + } + + void setTransA(bool transA) { + TransA = transA; + } + + void setTransB(bool transB) { + TransB = transB; + } + + void setBroadcast(bool broadcast) { + Broadcast = broadcast; + } + + private: + bool TransA; + bool TransB; + bool 
Broadcast; +}; + +class BatchGather : public NeuralNetOperator { + public: + BatchGather() : NeuralNetOperator(NNKind::BatchGather) {} + + ~BatchGather() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(BatchGather); + + private: +}; + +class ConcatBatchMatMulBatchGatherOp : public NeuralNetOperator { + public: + ConcatBatchMatMulBatchGatherOp() + : NeuralNetOperator(NNKind::ConcatBatchMatMulBatchGatherOp) {} + + ~ConcatBatchMatMulBatchGatherOp() {} + + NOMNIGRAPH_DEFINE_NN_RTTI(ConcatBatchMatMulBatchGatherOp); + + private: +}; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h index 4d15dd40613403..9c4277293d0b41 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpEnum.h @@ -1,4 +1,9 @@ Relu, Conv, ConvRelu, ConvTranspose, AveragePool, AveragePoolRelu, MaxPool, MaxPoolRelu, Sum, SumRelu, Send, Receive, BatchNormalization, FC, GivenTensorFill, Concat, Softmax, ChannelShuffle, Add, Reshape, Flatten, - NCHW2NHWC, NHWC2NCHW + NCHW2NHWC, NHWC2NCHW, Int8Quantize, Int8Dequantize, Int8AveragePool, + Int8Conv, Int8ConvTranspose, Int8FC, Int8MaxPool, Int8Relu, + Int8GivenTensorFill, Int8Concat, Int8Softmax, Int8ChannelShuffle, Int8Sum, + Int8Add, Int8Reshape, Int8Flatten, Int8ConvRelu, Int8SumRelu, + Int8AveragePoolRelu, Int8MaxPoolRelu, BatchMatMul, BatchGather, + ConcatBatchMatMulBatchGatherOp diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h index 88ffa0b1ba6bb0..87ffda3c4f3436 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Generated/OpNames.h @@ -1,68 +1,92 @@ case NNKind::Relu: return "Relu"; - case NNKind::Conv: return "Conv"; - case NNKind::ConvRelu: return "ConvRelu"; - case NNKind::ConvTranspose: return "ConvTranspose"; - case NNKind::AveragePool: return "AveragePool"; - case NNKind::AveragePoolRelu: return "AveragePoolRelu"; - case NNKind::MaxPool: return "MaxPool"; - case NNKind::MaxPoolRelu: return "MaxPoolRelu"; - case NNKind::Sum: return "Sum"; - case NNKind::SumRelu: return "SumRelu"; - case NNKind::Send: return "Send"; - case NNKind::Receive: return "Receive"; - case NNKind::BatchNormalization: return "BatchNormalization"; - case NNKind::FC: return "FC"; - case NNKind::GivenTensorFill: return "GivenTensorFill"; - case NNKind::Concat: return "Concat"; - case NNKind::Softmax: return "Softmax"; - case NNKind::ChannelShuffle: return "ChannelShuffle"; - case NNKind::Add: return "Add"; - case NNKind::Reshape: return "Reshape"; - case NNKind::Flatten: return "Flatten"; - case NNKind::NCHW2NHWC: return "NCHW2NHWC"; - case NNKind::NHWC2NCHW: return "NHWC2NCHW"; +case NNKind::Int8Quantize: + return "Int8Quantize"; +case NNKind::Int8Dequantize: + return "Int8Dequantize"; +case NNKind::Int8AveragePool: + return "Int8AveragePool"; +case NNKind::Int8Conv: + return "Int8Conv"; +case NNKind::Int8ConvTranspose: + return "Int8ConvTranspose"; +case NNKind::Int8FC: + return "Int8FC"; +case NNKind::Int8MaxPool: + return "Int8MaxPool"; +case NNKind::Int8Relu: + return "Int8Relu"; +case NNKind::Int8GivenTensorFill: + return "Int8GivenTensorFill"; +case NNKind::Int8Concat: + return "Int8Concat"; +case NNKind::Int8Softmax: + return "Int8Softmax"; +case NNKind::Int8ChannelShuffle: + return "Int8ChannelShuffle"; +case NNKind::Int8Sum: + return "Int8Sum"; +case NNKind::Int8Add: + return "Int8Add"; 
+case NNKind::Int8Reshape: + return "Int8Reshape"; +case NNKind::Int8Flatten: + return "Int8Flatten"; +case NNKind::Int8ConvRelu: + return "Int8ConvRelu"; +case NNKind::Int8SumRelu: + return "Int8SumRelu"; +case NNKind::Int8AveragePoolRelu: + return "Int8AveragePoolRelu"; +case NNKind::Int8MaxPoolRelu: + return "Int8MaxPoolRelu"; +case NNKind::BatchMatMul: + return "BatchMatMul"; +case NNKind::BatchGather: + return "BatchGather"; +case NNKind::ConcatBatchMatMulBatchGatherOp: + return "ConcatBatchMatMulBatchGatherOp"; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h index aab127d8c56e16..3c5148e5b6c70f 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Graph/Graph.h @@ -46,31 +46,28 @@ class Edge : public StorageType { public: using NodeRef = typename Graph::NodeRef; Edge(NodeRef tail, NodeRef head, U... args) - : StorageType(std::forward(args)...), - tail_(tail), - head_(head) { + : StorageType(std::forward(args)...), Tail(tail), Head(head) { DEBUG_PRINT("Creating instance of Edge: %p\n", this); } const NodeRef& tail() const { - return tail_; + return Tail; } const NodeRef& head() const { - return head_; + return Head; } void setTail(NodeRef n) { - tail_ = n; + Tail = n; } void setHead(NodeRef n) { - head_ = n; + Head = n; } private: - NodeRef tail_; - NodeRef head_; - + NodeRef Tail; + NodeRef Head; friend class Graph; }; @@ -91,55 +88,54 @@ class Node : public StorageType, public Notifier> { /// \brief Adds an edge by reference to known in-edges. /// \p e A reference to an edge that will be added as an in-edge. void addInEdge(EdgeRef e) { - inEdges_.emplace_back(e); + inEdges.emplace_back(e); } /// \brief Adds an edge by reference to known out-edges. /// \p e A reference to an edge that will be added as an out-edge. void addOutEdge(EdgeRef e) { - outEdges_.emplace_back(e); + outEdges.emplace_back(e); } /// \brief Removes an edge by reference to known in-edges. /// \p e A reference to an edge that will be removed from in-edges. void removeInEdge(EdgeRef e) { - removeEdgeInternal(inEdges_, e); + auto iter = std::find(inEdges.begin(), inEdges.end(), e); + assert( + iter != inEdges.end() && + "Attempted to remove edge that isn't connected to this node"); + inEdges.erase(iter); } /// \brief Removes an edge by reference to known out-edges. /// \p e A reference to an edge that will be removed from out-edges. 
void removeOutEdge(EdgeRef e) { - removeEdgeInternal(outEdges_, e); + auto iter = std::find(outEdges.begin(), outEdges.end(), e); + assert( + iter != outEdges.end() && + "Attempted to remove edge that isn't connected to this node"); + outEdges.erase(iter); } const std::vector& getOutEdges() const { - return outEdges_; + return outEdges; } const std::vector& getInEdges() const { - return inEdges_; + return inEdges; } - void setInEdges(std::vector edges) { - inEdges_ = edges; + void setInEdges(std::vector es) { + inEdges = es; } - void setOutEdges(std::vector edges) { - outEdges_ = edges; + void setOutEdges(std::vector es) { + outEdges = es; } - private: - std::vector inEdges_; - std::vector outEdges_; - + protected: + std::vector inEdges; + std::vector outEdges; friend class Graph; - - void removeEdgeInternal(std::vector& edges, EdgeRef e) { - auto iter = std::find(edges.begin(), edges.end(), e); - assert( - iter != edges.end() && - "Attempted to remove edge that isn't connected to this node"); - edges.erase(iter); - } }; /// \brief Effectively a constant reference to a graph. @@ -162,56 +158,46 @@ class Subgraph { using EdgeRef = typename Graph::EdgeRef; void addNode(NodeRef n) { - nodes_.insert(n); + Nodes.insert(n); } - bool hasNode(NodeRef n) const { - return nodes_.count(n) != 0; + return Nodes.count(n) != 0; } - void removeNode(NodeRef n) { - nodes_.erase(n); + Nodes.erase(n); } void addEdge(EdgeRef e) { - edges_.insert(e); + Edges.insert(e); } - - bool hasEdge(EdgeRef e) const { - return edges_.count(e) != 0; + bool hasEdge(EdgeRef n) const { + return Edges.count(n) != 0; } - void removeEdge(EdgeRef e) { - edges_.erase(e); + Edges.erase(e); } const std::unordered_set& getNodes() const { - return nodes_; - } - - const size_t getNodesCount() const { - return (size_t)nodes_.size(); + return Nodes; } - const std::unordered_set& getEdges() const { - return edges_; + return Edges; } - private: - std::unordered_set nodes_; - std::unordered_set edges_; - void printEdges() { - for (const auto& edge : edges_) { + for (const auto& edge : Edges) { printf("Edge: %p (%p -> %p)\n", &edge, edge->tail(), edge->head()); } } void printNodes() const { - for (const auto& node : nodes_) { + for (const auto& node : Nodes) { printf("Node: %p\n", node); } } + + std::unordered_set Nodes; + std::unordered_set Edges; }; /// \brief A simple graph implementation @@ -245,21 +231,21 @@ class Graph { } void importNode(NodeRef node, Graph& otherGraph) { - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { if (&(*it) == node) { - std::list>& otherNodes = otherGraph.nodes_; - otherNodes.splice(otherNodes.end(), nodes_, it, ++it); - otherGraph.nodeRefs_.insert(node); + std::list>& otherNodes = otherGraph.Nodes; + otherNodes.splice(otherNodes.end(), Nodes, it, ++it); + otherGraph.NodeRefs.insert(node); break; } } } void importEdge(EdgeRef edge, Graph& otherGraph) { - std::list>& otherEdges = otherGraph.edges_; - for (auto it = edges_.begin(); it != edges_.end(); ++it) { + std::list>& otherEdges = otherGraph.Edges; + for (auto it = Edges.begin(); it != Edges.end(); ++it) { if (&(*it) == edge) { - otherEdges.splice(otherEdges.end(), edges_, it, ++it); + otherEdges.splice(otherEdges.end(), Edges, it, ++it); break; } } @@ -327,9 +313,9 @@ class Graph { /// \return A reference to the edge created. EdgeRef createEdge(NodeRef tail, NodeRef head, U... 
data) { DEBUG_PRINT("Creating edge (%p -> %p)\n", tail, head); - this->edges_.emplace_back( + this->Edges.emplace_back( Edge(tail, head, std::forward(data)...)); - EdgeRef e = &this->edges_.back(); + EdgeRef e = &this->Edges.back(); head->addInEdge(e); tail->addOutEdge(e); return e; @@ -353,85 +339,85 @@ class Graph { /// related to the node. void deleteNode(NodeRef n, bool deleteEdges = true) { if (deleteEdges) { - auto inEdges = n->inEdges_; + auto inEdges = n->inEdges; for (auto& edge : inEdges) { deleteEdge(edge); } - auto outEdges = n->outEdges_; + auto outEdges = n->outEdges; for (auto& edge : outEdges) { deleteEdge(edge); } } - for (auto i = nodes_.begin(); i != nodes_.end(); ++i) { + for (auto i = Nodes.begin(); i != Nodes.end(); ++i) { if (&*i == n) { - nodeRefs_.erase(n); - nodes_.erase(i); + NodeRefs.erase(n); + Nodes.erase(i); break; } } } - bool hasNode(NodeRef node) const { - return nodeRefs_.find(node) != nodeRefs_.end(); + bool hasNode(NodeRef ref) const { + return NodeRefs.find(ref) != NodeRefs.end(); } /// \brief Deletes a edge from the graph. /// \p e A reference to the edge. - void deleteEdge(EdgeRef e, bool removeRef = true) { - if (removeRef) { - e->tail_->removeOutEdge(e); - e->head_->removeInEdge(e); + void deleteEdge(EdgeRef e, bool remove_ref = true) { + if (remove_ref) { + e->Tail->removeOutEdge(e); + e->Head->removeInEdge(e); } - for (auto i = edges_.begin(); i != edges_.end(); ++i) { + for (auto i = Edges.begin(); i != Edges.end(); ++i) { if (&*i == e) { - edges_.erase(i); + Edges.erase(i); break; } } } const std::vector getMutableNodes() { - std::vector result; - for (auto& n : nodes_) { + std::vector v; + for (auto& n : Nodes) { DEBUG_PRINT("Adding node to mutable output (%p)\n", &n); - result.emplace_back(&n); + v.emplace_back(&n); } - return result; + return v; } const std::vector getMutableEdges() { - std::vector result; - for (auto& e : edges_) { + std::vector v; + for (auto& e : Edges) { DEBUG_PRINT("Adding edge to mutable output (%p)\n", &e); - result.emplace_back(&e); + v.emplace_back(&e); } - return result; - } - - private: - std::list> nodes_; - std::list> edges_; - std::unordered_set nodeRefs_; - - NodeRef createNodeInternal(Node&& node) { - nodes_.emplace_back(std::move(node)); - NodeRef nodeRef = &nodes_.back(); - DEBUG_PRINT("Creating node (%p)\n", nodeRef); - nodeRefs_.insert(nodeRef); - return nodeRef; + return v; } void printEdges() { - for (const auto& edge : edges_) { + for (const auto& edge : Edges) { printf("Edge: %p (%p -> %p)\n", &edge, edge.tail(), edge.head()); } } void printNodes() const { - for (const auto& node : nodes_) { + for (const auto& node : Nodes) { printf("Node: %p\n", &node); } } + + private: + std::list> Nodes; + std::list> Edges; + std::unordered_set NodeRefs; + + NodeRef createNodeInternal(Node&& node) { + Nodes.emplace_back(std::move(node)); + NodeRef nodeRef = &Nodes.back(); + DEBUG_PRINT("Creating node (%p)\n", nodeRef); + NodeRefs.insert(nodeRef); + return nodeRef; + } }; } // namespace nom diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h deleted file mode 100644 index 08ead742950740..00000000000000 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H -#define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H - -namespace nom { - -namespace matcher { - -/* - * Subtree matching criteria consists of - * - 
Node matching criteria for the subtree's root. - * - Children subtree matching criteria - * - A count, which means we may want more than one of this subtree. The count - * can be unlimited. The count is only used when we match children of a - * subtree root, not matching the subtree itself. - */ -template -class SubtreeMatchCriteria { - public: - static const int kStarCount = -1; - SubtreeMatchCriteria( - const NodeMatchCriteria& root, - const std::vector& children, - int count) - : root_(root), children_(children), count_(count){}; - - private: - NodeMatchCriteria root_; - std::vector children_; - int count_; - - template - friend class SubgraphMatcher; -}; - -/* - * Utilities for subgraph matching. - */ -template < - typename GraphType, - typename NodeMatchCriteria, - typename NodeMatcherClass> -struct SubgraphMatcher { - static bool isNodeMatch( - typename GraphType::NodeRef node, - const NodeMatchCriteria& criteria) { - return NodeMatcherClass::isMatch(node, criteria); - } - - // Check if there can be a sub-tree that matches the given criteria that - // is rooted at the given rootNode. - // The flag invertGraphTraversal specify if we should follow out edges or - // in edges. The default is true which is useful for a functional - // intepretation of a dataflow graph. - static bool isSubtreeMatch( - typename GraphType::NodeRef root, - const SubtreeMatchCriteria& criteria, - bool invertGraphTraversal = true) { - if (!isNodeMatch(root, criteria.root_)) { - return false; - } - auto& edges = - invertGraphTraversal ? root->getInEdges() : root->getOutEdges(); - - int numEdges = edges.size(); - int numChildrenCriteria = criteria.children_.size(); - - // The current algorithm implies that the ordering of the children is - // important. The children nodes will be matched with the children subtree - // criteria in the given order. - - int currentEdgeIdx = 0; - for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; - criteriaIdx++) { - auto childrenCriteria = criteria.children_[criteriaIdx]; - - int expectedCount = childrenCriteria.count_; - bool isStarCount = - expectedCount == SubtreeMatchCriteria::kStarCount; - - int countMatch = 0; - - // Continue to match subsequent edges with the current children criteria. - // Note that if the child criteria is a * pattern, this greedy algorithm - // will attempt to find the longest possible sequence that matches the - // children criteria. - for (; currentEdgeIdx < numEdges && - (isStarCount || countMatch < expectedCount); - currentEdgeIdx++) { - auto edge = edges[currentEdgeIdx]; - auto nextNode = invertGraphTraversal ? edge->tail() : edge->head(); - - if (!isSubtreeMatch(nextNode, childrenCriteria, invertGraphTraversal)) { - if (!isStarCount) { - // If the current criteria isn't a * pattern, this indicates a - // failure. - return false; - } else { - // Otherwise, we should move on to the next children criteria. - break; - } - } - - countMatch++; - } - - if (countMatch < expectedCount) { - // Fails because there are not enough matches as specified by the - // criteria. - return false; - } - } - - if (currentEdgeIdx < numEdges) { - // Fails because there are unmatched edges. - return false; - } - return true; - } - - // Utility to transform a graph by looking for subtrees that match - // a given pattern and then allow callers to mutate the graph based on - // subtrees that are found. - // The current implementation doesn't handle any graph transformation - // itself. 
Callers should be responsible for all intended mutation, including - // deleting nodes in the subtrees found by this algorithm. - // Note: if the replaceFunction lambda returns false, the entire procedure - // is aborted. This maybe useful in certain cases when we want to terminate - // the subtree search early. - // invertGraphTraversal flag: see documentation in isSubtreeMatch - static void replaceSubtree( - GraphType& graph, - const SubtreeMatchCriteria& criteria, - const std::function< - bool(GraphType& g, typename GraphType::NodeRef subtreeRoot)>& - replaceFunction, - bool invertGraphTraversal = true) { - for (auto nodeRef : graph.getMutableNodes()) { - // Make sure the node is still in the graph. - if (!graph.hasNode(nodeRef)) { - continue; - } - if (isSubtreeMatch(nodeRef, criteria, invertGraphTraversal)) { - if (!replaceFunction(graph, nodeRef)) { - // If replaceFunction returns false, it means that we should abort - // the entire procedure. - break; - } - } - } - } -}; - -// Convenient methods to create subtree matching criteria. -template -SubtreeMatchCriteria tree( - const NodeMatchCriteria& root, - const std::vector>& children = {}, - int count = 1) { - return SubtreeMatchCriteria(root, children, count); -} - -template -SubtreeMatchCriteria treeStar( - const NodeMatchCriteria& root, - const std::vector>& children = {}) { - return tree( - root, children, SubtreeMatchCriteria::kStarCount); -} - -} // namespace matcher - -} // namespace nom - -#endif // NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index 2d1125f5762ad4..c62148ea52cff5 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -6,8 +6,6 @@ from __future__ import unicode_literals import argparse -from textwrap import dedent -from subprocess import call def parse_lines(lines): @@ -24,27 +22,25 @@ def parse_lines(lines): index = 0 while index < len(lines): line = lines[index] - if line.lower().startswith("macro"): - assert parse_state == EMPTY - macro_line = line.split(" ") + if line.lower().startswith('macro'): + assert (parse_state == EMPTY) + macro_line = line.split(' ') # Support macros that look like attributes # e.g. 
macro - CONV_LIKE - curr_macro = " ".join(macro_line[1:]) - assert curr_macro not in macros, 'Macro "{}" defined twice.'.format( - curr_macro - ) + curr_macro = ' '.join(macro_line[1:]) + assert (curr_macro not in macros) macros[curr_macro] = [] parse_state = MACRO - lines = lines[:index] + lines[index + 1 :] + lines = lines[:index] + lines[index + 1:] continue - elif line.lower().startswith("endmacro"): - assert parse_state == MACRO + elif line.lower().startswith('endmacro'): + assert (parse_state == MACRO) parse_state = EMPTY - lines = lines[:index] + lines[index + 1 :] + lines = lines[:index] + lines[index + 1:] continue elif parse_state == MACRO: macros[curr_macro].append(line) - lines = lines[:index] + lines[index + 1 :] + lines = lines[:index] + lines[index + 1:] continue index += 1 @@ -52,7 +48,7 @@ def parse_lines(lines): while index < len(lines): line = lines[index] if line in macros: - lines = lines[:index] + macros[line] + lines[index + 1 :] + lines = lines[:index] + macros[line] + lines[index + 1:] index += len(macros[line]) - 1 index += 1 @@ -67,20 +63,20 @@ def parse_lines(lines): for line in lines: if not len(line): continue - if line[0] == "-": - assert parse_state is OP - attr = [_.strip() for _ in line[1:].split(":")] - assert attr[0][0].isupper() - if len(attr) == 2: # attribute : type + if line[0] == '-': + assert (parse_state is OP) + attr = [_.strip() for _ in line[1:].split(':')] + assert (attr[0][0].isupper()) + if (len(attr) == 2): # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1])) - elif len(attr) == 3: # attribute : type + elif (len(attr) == 3): # attribute : type ops[curr_op]["attributes"].append((attr[0], attr[1], attr[2])) else: - op = [l.strip() for l in line.split(":")] - assert len(op[0].split(" ")) == 1 + op = [l.strip() for l in line.split(':')] + assert (len(op[0].split(' ')) == 1) parse_state = OP curr_op = op[0] - assert curr_op not in ops + assert (curr_op not in ops) ops[curr_op] = {} op_list.append(curr_op) if len(op) > 1: @@ -105,26 +101,20 @@ def gen_class(op, op_def): attr_arg = "{type} {lower_name}".format( type=t, lower_name=lower_name + default_arg ) - attr_init = "{name}({lower_name})".format(name=name, lower_name=lower_name) - attr_declare = "{type} {name};".format(type=t, name=name) - attr_get = dedent( - """ - {type} get{name}() const {{ - return {name}; - }} - """.format( - type=t, name=name - ) - ) - attr_set = dedent( - """ - void set{name}({type} {lower_name}) {{ - {name} = {lower_name}; - }} - """.format( - type=t, name=name, lower_name=lower_name - ) + attr_init = "{name}({lower_name})".format( + name=name, lower_name=lower_name ) + attr_declare = "{type} {name};".format(type=t, name=name) + attr_get = """ + {type} get{name}() const {{ + return {name}; + }} +""".format(type=t, name=name) + attr_set = """ + void set{name}({type} {lower_name}) {{ + {name} = {lower_name}; + }} +""".format(type=t, name=name, lower_name=lower_name) attribute_args.append(attr_arg) attribute_init.append(attr_init) attribute_declarations.append(attr_declare) @@ -142,43 +132,38 @@ def gen_class(op, op_def): name=attr[0], other_op=lower_other_op ) ) - init = dedent( - """ - {op}(const {other_op}& {lower_other_op}) : - {other_init} {{}} - """.format( - op=op, - other_op=other_op, - lower_other_op=lower_other_op, - other_init=",\n ".join(other_init), - ) + init = """ + {op}(const {other_op}& {lower_other_op}) : + {other_init} {{}} +""".format( + op=op, + other_op=other_op, + lower_other_op=lower_other_op, + other_init=',\n 
'.join(other_init) ) extra_init += init - return dedent( - """ - class {op} : public NeuralNetOperator {{ - public: - {op}({attribute_args}) : - {attribute_init} {{}} - {extra_init} - ~{op}() {{}} - - NOMNIGRAPH_DEFINE_NN_RTTI({op}); - {getters}{setters} - private: - {attribute_declarations} - }}; - - """.format( - op=op, - extra_init=extra_init, - getters="".join(attribute_getters), - setters="".join(attribute_setters), - attribute_args=",\n".join(attribute_args), - attribute_init=",\n".join(attribute_init), - attribute_declarations="\n".join(attribute_declarations), - ) + return """class {op} : public NeuralNetOperator {{ + public: + {op}({attribute_args}) : + {attribute_init} {{}} + {extra_init} + ~{op}() {{}} + + NOMNIGRAPH_DEFINE_NN_RTTI({op}); +{getters}{setters} + private: + {attribute_declarations} +}}; + +""".format( + op=op, + extra_init=extra_init, + getters=''.join(attribute_getters), + setters=''.join(attribute_setters), + attribute_args=',\n '.join(attribute_args), + attribute_init=',\n '.join(attribute_init), + attribute_declarations='\n '.join(attribute_declarations) ) @@ -190,51 +175,33 @@ def gen_classes(ops, op_list): def gen_enum(op_list): - return ",\n".join([op for op in op_list]) + "\n" + return ',\n'.join([op for op in op_list]) + '\n' def gen_names(op_list): f = "" for op in op_list: - f += dedent( - """ - case NNKind::{name}: - return \"{name}\"; - """.format( - name=op - ) - ) + f += """case NNKind::{name}: + return \"{name}\"; +""".format(name=op) return f if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate op files.") - parser.add_argument("--install_dir", help="installation directory") - parser.add_argument("--source_def", help="ops.def", action="append") + parser = argparse.ArgumentParser(description='Generate op files.') + parser.add_argument('--install_dir', help='installation directory') + parser.add_argument('--source_def', help='ops.def') args = parser.parse_args() install_dir = args.install_dir - sources = args.source_def - lines = [] - for source in sources: - with open(source, "rb") as f: - lines_tmp = f.readlines() - lines += [l.strip().decode("utf-8") for l in lines_tmp] + with open(args.source_def, 'rb') as f: + lines = f.readlines() + lines = [l.strip().decode("utf-8") for l in lines] ops, op_list = parse_lines(lines) - with open(install_dir + "/OpClasses.h", "wb") as f: + with open(install_dir + '/OpClasses.h', 'wb') as f: f.write(gen_classes(ops, op_list).encode("utf-8")) - with open(install_dir + "/OpNames.h", "wb") as f: + with open(install_dir + '/OpNames.h', 'wb') as f: f.write(gen_names(op_list).encode("utf-8")) - with open(install_dir + "/OpEnum.h", "wb") as f: + with open(install_dir + '/OpEnum.h', 'wb') as f: f.write(gen_enum(op_list).encode("utf-8")) - - try: - cmd = ["clang-format", "-i", install_dir + "/OpClasses.h"] - call(cmd) - cmd = ["clang-format", "-i", install_dir + "/OpNames.h"] - call(cmd) - cmd = ["clang-format", "-i", install_dir + "/OpEnum.h"] - call(cmd) - except Exception: - pass diff --git a/caffe2/core/nomnigraph/ops.def b/caffe2/core/nomnigraph/ops.def index 6183e3c25726a3..53dd951c8fc1c2 100644 --- a/caffe2/core/nomnigraph/ops.def +++ b/caffe2/core/nomnigraph/ops.def @@ -69,3 +69,30 @@ CopyFromOpenCL NCHW2NHWC NHWC2NCHW +Int8Quantize +Int8Dequantize +Int8AveragePool : AveragePool +Int8Conv : Conv +Int8ConvTranspose : ConvTranspose +Int8FC : FC +Int8MaxPool : MaxPool +Int8Relu : Relu +Int8GivenTensorFill : GivenTensorFill +Int8Concat : Concat +Int8Softmax : Softmax +Int8ChannelShuffle : 
ChannelShuffle +Int8Sum : Sum +Int8Add : Add +Int8Reshape : Reshape +Int8Flatten : Flatten +Int8ConvRelu : ConvRelu +Int8SumRelu : SumRelu +Int8AveragePoolRelu : AveragePoolRelu +Int8MaxPoolRelu : MaxPoolRelu + +BatchMatMul +- TransA : bool : false +- TransB : bool : true +- Broadcast: bool : false +BatchGather +ConcatBatchMatMulBatchGatherOp diff --git a/caffe2/core/nomnigraph/tests/binary_match_test.cc b/caffe2/core/nomnigraph/tests/binary_match_test.cc index ca3fd11b3a9126..4834cea30f3e23 100644 --- a/caffe2/core/nomnigraph/tests/binary_match_test.cc +++ b/caffe2/core/nomnigraph/tests/binary_match_test.cc @@ -19,7 +19,7 @@ TEST(BinaryMatch, AllMatch) { auto matches = nom::algorithm::binaryMatch( &graph, [](decltype(graph)::NodeRef n) { return true; }); EXPECT_EQ(matches.size(), 1); - EXPECT_EQ(matches.front().getNodesCount(), graph.getMutableNodes().size()); + EXPECT_EQ(matches.front().Nodes.size(), graph.getMutableNodes().size()); } TEST(BinaryMatch, EmptyGraph) { @@ -58,9 +58,9 @@ TEST(BinaryMatch, Basic) { EXPECT_EQ(matches.size(), 1); auto match = matches.front(); - EXPECT_EQ(match.getNodesCount(), 4); + EXPECT_EQ(match.Nodes.size(), 4); std::set exp{"2", "3", "4", "6"}; - for (auto n : match.getNodes()) { + for (auto n : match.Nodes) { EXPECT_EQ(exp.count(n->data()), 1); exp.erase(n->data()); } @@ -104,16 +104,16 @@ TEST(BinaryMatch, RemovedMiddleNode) { auto match1 = matches.front(); auto match2 = matches.back(); - EXPECT_EQ(match1.getNodesCount(), 2); - EXPECT_EQ(match2.getNodesCount(), 1); + EXPECT_EQ(match1.Nodes.size(), 2); + EXPECT_EQ(match2.Nodes.size(), 1); std::set exp1{"2", "4"}; std::set exp2{"6"}; - for (auto n : match1.getNodes()) { + for (auto n : match1.Nodes) { EXPECT_EQ(exp1.count(n->data()), 1); exp1.erase(n->data()); } - for (auto n : match2.getNodes()) { + for (auto n : match2.Nodes) { EXPECT_EQ(exp2.count(n->data()), 1); exp2.erase(n->data()); } diff --git a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc b/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc deleted file mode 100644 index ddd8a15fcdc2bc..00000000000000 --- a/caffe2/core/nomnigraph/tests/subgraph_matcher_test.cc +++ /dev/null @@ -1,404 +0,0 @@ -#include - -#include "test_util.h" - -#include "nomnigraph/Transformations/SubgraphMatcher.h" - -#include - -namespace nom { - -namespace matcher { - -using NodeType = std::string; -using Criteria = std::string; - -// Node matches a criteria (string) if the data string is the same as the -// criteria. Special case: "*" will match any thing. -struct TestNodeMatch { - static bool isMatch( - const nom::Graph::NodeRef& node, - const Criteria& criteria) { - return criteria == "*" || criteria == node->data(); - } -}; - -using TestGraph = Graph; -using TestMatcher = SubgraphMatcher; - -Criteria any() { - return Criteria("*"); -} - -// Make it more concise to create matching criteria in dataflow graph. -// For example, operatorTree("opA", ...) will refer to a tree like this: -// ... -> opA -> opA_Output -SubtreeMatchCriteria operatorTree( - const Criteria& root, - const std::vector>& childrenCriteria = {}, - int count = 1) { - return tree(any(), {tree(root, childrenCriteria)}, count); -} - -std::map TestGraphNodePrinter( - TestGraph::NodeRef node) { - std::map labelMap; - labelMap["label"] = node->data(); - return labelMap; -}; - -// Attempts to create a realistic dataflow graph that shows a fuse procedure. 
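The criteria convention exercised throughout this (removed) test is compact: a node matches a criteria string when the strings are equal, and "*" is a wildcard that matches anything. A minimal Python sketch of that rule, offered only as an illustration of the convention, not the C++ SubgraphMatcher API:

    def is_node_match(node_data, criteria):
        # "*" is a wildcard; any other criteria must equal the node's data string.
        return criteria == "*" or criteria == node_data

    assert is_node_match("opB", "*")
    assert is_node_match("opB", "opB")
    assert not is_node_match("opB", "opC")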
-struct DataFlowTestGraph { - const int numInputs = 4; - - TestGraph graph; - - TestGraph::NodeRef opB; - TestGraph::NodeRef opF; - TestGraph::NodeRef opC; - TestGraph::NodeRef opG; - TestGraph::NodeRef dataOut; - - // Realistic data flow test graph. - /* - - - +---------------+ - | | - | +---------+ | +---------+ - +---------------------+ | input_A | | | input_B | - | +---------+ | +---------+ - | | | | - | | | | - | v v v - +---------++---------+ +-------------------------+ +--------+ - | input_C || input_D | --> | opC | --> | dataC2 | - +---------++---------+ +-------------------------+ +--------+ - | - | - v - +---------+ - | dataC | -+ - +---------+ | - | | - | | - v | - +---------+ | - | opB | <+ - +---------+ - | - | - v - +---------+ - | dataB | - +---------+ - | - | - v - +---------+ - | opF | - +---------+ - | - | - v - +---------+ - | dataF | - +---------+ - | - | - v - +---------+ +---------+ - | dataI | --> | opG | - +---------+ +---------+ - | - | - v - +---------+ - | dataOut | - +---------+ - */ - DataFlowTestGraph() { - opC = graph.createNode("opC"); - - for (int i = 0; i < numInputs; i++) { - auto dataInput = graph.createNode("input"); - graph.createEdge(dataInput, opC); - } - - auto dataC = graph.createNode("dataC"); - auto dataC2 = graph.createNode("dataC2"); - graph.createEdge(opC, dataC); - graph.createEdge(opC, dataC2); - - opB = graph.createNode("opB"); - // There are 2 edges - graph.createEdge(dataC, opB); - graph.createEdge(dataC, opB); - - auto dataB = graph.createNode("dataB"); - graph.createEdge(opB, dataB); - - opF = graph.createNode("opF"); - graph.createEdge(dataB, opF); - - auto dataF = graph.createNode("dataF"); - graph.createEdge(opF, dataF); - - auto dataI = graph.createNode("dataI"); - - opG = graph.createNode("opG"); - graph.createEdge(dataF, opG); - graph.createEdge(dataI, opG); - - dataOut = graph.createNode("dataOut"); - graph.createEdge(opG, dataOut); - - // Use nom::converters::convertToDotString(&graph, TestGraphNodePrinter) - // to visualize the graph. - } -}; - -SubtreeMatchCriteria DataFlowTestGraphCriteria() { - // clang-format off - return tree( - Criteria("opG"),{ - operatorTree("opF", { - // Note: we currently don't enforce that these 2 opC nodes - // have to be the same. - operatorTree("opB", { - operatorTree("opC", { - treeStar(Criteria("input")) - }, 2), - }) - }), - tree(any()) // matches dataI - }); - // clang-format on -} - -TestGraph::NodeRef getInNode(TestGraph::NodeRef node, int index) { - return node->getInEdges()[index]->tail(); -} - -} // namespace matcher - -} // namespace nom - -using namespace nom::matcher; - -// Simple test cases for node matching criteria. -TEST(SubgraphMatcher, IsNodeMatch) { - TestGraph graph; - auto n1 = graph.createNode("Hello"); - auto n2 = graph.createNode("Le"); - graph.createEdge(n1, n2); - - EXPECT_TRUE(TestMatcher::isNodeMatch(n1, "Hello")); - EXPECT_FALSE(TestMatcher::isNodeMatch(n1, "G")); - EXPECT_TRUE(TestMatcher::isNodeMatch(n2, "Le")); - EXPECT_FALSE(TestMatcher::isNodeMatch(n2, "le")); -} - -// Test subtree matching with a simple tree graph. 
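The ASCII diagram above is hard to read in patch form; the same topology, rebuilt as a plain adjacency list in Python for orientation only (node names follow the test, with inputs numbered here for uniqueness):

    # inputs -> opC -> {dataC, dataC2}; dataC -> opB (two parallel edges) ->
    # dataB -> opF -> dataF; {dataF, dataI} -> opG -> dataOut
    edges = []
    for i in range(4):                                   # numInputs == 4 in the test
        edges.append(("input_%d" % i, "opC"))
    edges += [("opC", "dataC"), ("opC", "dataC2")]
    edges += [("dataC", "opB"), ("dataC", "opB")]        # the two parallel edges
    edges += [("opB", "dataB"), ("dataB", "opF"), ("opF", "dataF")]
    edges += [("dataF", "opG"), ("dataI", "opG"), ("opG", "dataOut")]

    consumers = {}
    for tail, head in edges:
        consumers.setdefault(tail, []).append(head)
    assert consumers["dataC"] == ["opB", "opB"]          # dataC feeds opB twice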
-TEST(SubgraphMatcher, IsSubtreeMatch) { - TestGraph graph; - auto n1 = graph.createNode("1"); - auto n2 = graph.createNode("2"); - auto n3 = graph.createNode("3"); - auto n4 = graph.createNode("4"); - auto n5 = graph.createNode("5"); - auto n6 = graph.createNode("6"); - auto n7 = graph.createNode("7"); - - graph.createEdge(n1, n2); - graph.createEdge(n2, n3); - graph.createEdge(n2, n4); - graph.createEdge(n1, n5); - graph.createEdge(n5, n6); - graph.createEdge(n5, n7); - /* N1 - / \ - N2 N5 - / \ / \ - N3 N4 N6 N7 - */ - - auto subtree = tree(any(), {tree(any()), tree(any())}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n4, subtree, false)); - - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); - - subtree = tree(Criteria("5"), {tree(any()), tree(any())}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n2, subtree, false)); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n5, subtree, false)); - - subtree = tree(any(), {tree(any()), tree(Criteria("4"))}); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n2, subtree, false)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n5, subtree, false)); -} - -// Test subtree matching in which * (repeated) matching of children is allowed. -TEST(SubgraphMatcher, IsSubtreeMatchRepeated) { - TestGraph graph; - auto n1 = graph.createNode("1"); - auto n2 = graph.createNode("2"); - auto n3A = graph.createNode("3"); - auto n3B = graph.createNode("3"); - auto n4 = graph.createNode("4"); - auto n5A = graph.createNode("5"); - auto n5B = graph.createNode("5"); - auto n5C = graph.createNode("5"); - graph.createEdge(n1, n2); - graph.createEdge(n1, n3A); - graph.createEdge(n1, n3B); - graph.createEdge(n1, n4); - graph.createEdge(n1, n4); - graph.createEdge(n1, n5A); - graph.createEdge(n1, n5B); - graph.createEdge(n1, n5C); - - auto subtree = tree(any(), {tree(Criteria("2"))}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), {treeStar(Criteria("2"))}); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - // clang-format off - subtree = tree(any(), { - tree(Criteria("2")), - tree(Criteria("3"), {}, 2), - tree(Criteria("4"), {}, 2), - tree(Criteria("5"), {}, 3) - }); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - tree(Criteria("3"), {}, 2), - tree(Criteria("4"), {}, 2), - treeStar(Criteria("5")) - }); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - treeStar(Criteria("3")), - tree(Criteria("4"), {}, 2), - treeStar(Criteria("5")) - }); - EXPECT_TRUE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - treeStar(Criteria("3")), - }); - // Fails because there are unmatched edges. - EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - - subtree = tree(any(), { - tree(Criteria("2")), - tree(Criteria("3"), {}, 2), - tree(Criteria("4")), - tree(Criteria("5"), {}, 3) - }); - // Fails because the count is wrong; we have 2 edges to node N4 while - // the pattern expects only 1. 
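The counted-children semantics that the failing case above exercises can be restated outside C++; a rough Python sketch of the counting rule only, not the SubgraphMatcher implementation:

    from collections import Counter

    def children_match(child_labels, exact_counts, star_labels=()):
        counts = Counter(child_labels)
        for label, expected in exact_counts.items():
            if counts.pop(label, 0) != expected:
                return False
        # Anything left over must be absorbed by a "star" (repeated) criteria.
        return all(label in star_labels for label in counts)

    children = ["2", "3", "3", "4", "4", "5", "5", "5"]   # N1's children above
    # Passes when "4" is expected twice and "5" is starred.
    assert children_match(children, {"2": 1, "3": 2, "4": 2}, star_labels=("5",))
    # Fails when only one edge to "4" is expected, mirroring the comment above.
    assert not children_match(children, {"2": 1, "3": 2, "4": 1, "5": 3})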
- EXPECT_FALSE(TestMatcher::isSubtreeMatch(n1, subtree, false)); - // clang-format on -} - -TEST(SubgraphMatcher, IsSubtreeMatchRealistic) { - auto graph = DataFlowTestGraph(); - auto subtree = DataFlowTestGraphCriteria(); - - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opF, subtree)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opC, subtree)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.opB, subtree)); - EXPECT_FALSE(TestMatcher::isSubtreeMatch(graph.dataOut, subtree)); - - EXPECT_TRUE(TestMatcher::isSubtreeMatch(graph.opG, subtree)); -} - -TEST(SubgraphMatcher, ReplaceSubtreeRealistic) { - auto graph = DataFlowTestGraph(); - auto subtree = DataFlowTestGraphCriteria(); - - TestMatcher::replaceSubtree( - graph.graph, subtree, [](TestGraph& g, TestGraph::NodeRef opG) { - auto opFused = g.createNode("opFused"); - - auto dataF = getInNode(opG, 0); - auto opF = getInNode(dataF, 0); - auto dataB = getInNode(opF, 0); - auto opB = getInNode(dataB, 0); - auto dataC = getInNode(opB, 0); - auto opC = getInNode(dataC, 0); - - g.deleteNode(dataF); - g.replaceNode(opG, opFused); - - auto outEdgesC = opC->getOutEdges(); - g.deleteNode(outEdgesC[0]->head()); - g.deleteNode(outEdgesC[1]->head()); - g.replaceNode(opC, opFused); - - g.deleteNode(opC); - g.deleteNode(opB); - g.deleteNode(dataB); - g.deleteNode(opF); - g.deleteNode(opG); - - return true; - }); - - // Now the nodes are: - // - NumInputs input nodes - // - dataI node - // - fused node - // - output node - auto nodes = graph.graph.getMutableNodes(); - - // Test that the graph is transformed as expected. - EXPECT_EQ(nodes.size(), graph.numInputs + 3); - TestGraph::NodeRef opFused; - TestGraph::NodeRef dataI; - TestGraph::NodeRef dataOut; - for (auto node : nodes) { - if (node->data() == "opFused") { - opFused = node; - } else if (node->data() == "dataOut") { - dataOut = node; - } else if (node->data() == "dataI") { - dataI = node; - } - } - - EXPECT_EQ(getInNode(dataOut, 0), opFused); - EXPECT_EQ(getInNode(opFused, 0), dataI); - for (int i = 1; i <= graph.numInputs; i++) { - EXPECT_EQ(getInNode(opFused, i)->data(), "input"); - } - - // Use nom::converters::convertToDotString(&graph.graph, TestGraphNodePrinter) - // to visualize. The transformed graph looks like This - /* - - +---------++---------+ - | input_A || input_D | - +---------++---------+ - | | - | | - v v -+---------+ +--------------------+ +---------+ -| input_B | --> | opFused | <-- | input_C | -+---------+ +--------------------+ +---------+ - | ^ - | | - v | - +---------++---------+ - | dataOut || dataI | - +---------++---------+ - */ -} diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 9f88f192936fe4..734d38d75e680d 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -323,10 +323,6 @@ class OperatorBase : public Observable { return !event_; } - virtual void SyncDevice() { - CAFFE_NOT_IMPLEMENTED; - } - // Checks whether stream is ready to execute new computation, // used in stream allocation optimization to skip stream that is currently // busy. 
Depends on context and operator's device, returns true by default @@ -581,8 +577,6 @@ class Operator : public OperatorBase { return &context_; } - void SyncDevice() final {} - virtual std::vector> InputFillers( const std::vector>& shapes) { CAFFE_ENFORCE(shapes.size() == Inputs().size()); diff --git a/caffe2/core/operator_gpu.cc b/caffe2/core/operator_gpu.cc deleted file mode 100644 index 03f227f7453524..00000000000000 --- a/caffe2/core/operator_gpu.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/operator.h" - -namespace caffe2 { - -template <> -void Operator::SyncDevice() { - auto* context = getContext(); - int device; - cudaGetDevice(&device); - - cudaEvent_t ev; - cudaSetDevice(context->cuda_gpu_id()); - cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); - cudaEventRecord(ev, context->cuda_stream()); - cudaEventSynchronize(ev); - cudaEventDestroy(ev); - cudaSetDevice(device); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) { - CAFFE_THROW("Encountered CUDA error Stop: ", cudaGetErrorString(error)); - } -} - -} // namespace caffe2 diff --git a/caffe2/core/typeid.h b/caffe2/core/typeid.h index facea9fa64d2fa..b4a01b57cc11e3 100644 --- a/caffe2/core/typeid.h +++ b/caffe2/core/typeid.h @@ -14,9 +14,8 @@ #include -#include "ATen/core/Half.h" #include "caffe2/core/common.h" -#include "ATen/core/IdWrapper.h" +#include "caffe2/utils/IdWrapper.h" namespace caffe2 { class CaffeTypeId; @@ -33,16 +32,16 @@ class TypeMeta; * You need to register your types using CAFFE_KNOWN_TYPE(MyType) to be able to use CaffeTypeId with custom types. * This is for example used to store the dtype of tensors. */ -class CaffeTypeId final : public at::IdWrapper { +class CaffeTypeId final : public c10::guts::IdWrapper { public: static CaffeTypeId createTypeId(); friend std::ostream& ::operator<<(std::ostream& stream, CaffeTypeId typeId); friend bool operator<(CaffeTypeId lhs, CaffeTypeId rhs); - // This is 8, because 0 is uint8_t (due to ScalarType BC constraint) + // TODO Can we get rid of uninitialized? static constexpr CaffeTypeId uninitialized() { - return CaffeTypeId(8); + return CaffeTypeId(0); } private: @@ -58,7 +57,7 @@ inline bool operator<(CaffeTypeId lhs, CaffeTypeId rhs) { } -AT_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) +C10_DEFINE_HASH_FOR_IDWRAPPER(caffe2::CaffeTypeId) inline std::ostream& operator<<(std::ostream& stream, caffe2::CaffeTypeId typeId) { return stream << typeId.underlyingId(); @@ -440,41 +439,35 @@ inline bool operator!=(const TypeMeta& lhs, const TypeMeta& rhs) noexcept { class Tensor; -// Note: we have preallocated the numbers 0-8 so they line up exactly -// with at::ScalarType's numbering. All other numbers do not matter. -// -// Notably, the "uninitialized" type id is 8, not 0, for hysterical raisins. - +// note: first preallocated id is 1, because 0 is used for uninitialized type +// ids. 
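The note above captures the convention this version of typeid.h adopts: id 0 is reserved for the uninitialized state, so preallocated ids start at 1. A toy Python registry illustrating that reservation, as an analogy only and not the CaffeTypeId implementation:

    import itertools

    class TypeRegistry:
        UNINITIALIZED = 0                       # never handed out

        def __init__(self):
            self._next = itertools.count(1)     # preallocated ids start at 1
            self._ids = {}

        def register(self, name):
            if name not in self._ids:
                self._ids[name] = next(self._next)
            return self._ids[name]

    reg = TypeRegistry()
    assert reg.register("Tensor") == 1          # matches the first declaration below
    assert reg.register("float") == 2
    assert reg.register("Tensor") == 1          # stable across re-registration
    assert TypeRegistry.UNINITIALIZED not in reg._ids.values()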
struct _CaffeHighestPreallocatedTypeId final {}; -CAFFE_DECLARE_KNOWN_TYPE(0, uint8_t); -CAFFE_DECLARE_KNOWN_TYPE(1, int8_t); -CAFFE_DECLARE_KNOWN_TYPE(2, int16_t); +CAFFE_DECLARE_KNOWN_TYPE(1, Tensor); +CAFFE_DECLARE_KNOWN_TYPE(2, float); CAFFE_DECLARE_KNOWN_TYPE(3, int); -CAFFE_DECLARE_KNOWN_TYPE(4, int64_t); -CAFFE_DECLARE_KNOWN_TYPE(5, at::Half); -CAFFE_DECLARE_KNOWN_TYPE(6, float); -CAFFE_DECLARE_KNOWN_TYPE(7, double); -// 8 = undefined type id - -CAFFE_DECLARE_KNOWN_TYPE(9, Tensor); -CAFFE_DECLARE_KNOWN_TYPE(10, std::string); -CAFFE_DECLARE_KNOWN_TYPE(11, bool); -CAFFE_DECLARE_KNOWN_TYPE(12, uint16_t); -CAFFE_DECLARE_KNOWN_TYPE(13, char); -CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr); -CAFFE_DECLARE_KNOWN_TYPE(15, std::unique_ptr>); -CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(18, std::vector); -CAFFE_DECLARE_KNOWN_TYPE(19, bool*); -CAFFE_DECLARE_KNOWN_TYPE(20, char*); -CAFFE_DECLARE_KNOWN_TYPE(21, int*); +CAFFE_DECLARE_KNOWN_TYPE(4, std::string); +CAFFE_DECLARE_KNOWN_TYPE(5, bool); +CAFFE_DECLARE_KNOWN_TYPE(6, uint8_t); +CAFFE_DECLARE_KNOWN_TYPE(7, int8_t); +CAFFE_DECLARE_KNOWN_TYPE(8, uint16_t); +CAFFE_DECLARE_KNOWN_TYPE(9, int16_t); +CAFFE_DECLARE_KNOWN_TYPE(10, int64_t); +CAFFE_DECLARE_KNOWN_TYPE(11, double); +CAFFE_DECLARE_KNOWN_TYPE(12, char); +CAFFE_DECLARE_KNOWN_TYPE(13, std::unique_ptr); +CAFFE_DECLARE_KNOWN_TYPE(14, std::unique_ptr>); +CAFFE_DECLARE_KNOWN_TYPE(15, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(16, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(17, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(18, bool*); +CAFFE_DECLARE_KNOWN_TYPE(19, char*); +CAFFE_DECLARE_KNOWN_TYPE(20, int*); #ifdef CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(22, long); -CAFFE_DECLARE_KNOWN_TYPE(23, std::vector); +CAFFE_DECLARE_KNOWN_TYPE(21, long); +CAFFE_DECLARE_KNOWN_TYPE(22, std::vector); #endif // CAFFE2_UNIQUE_LONG_TYPEMETA -CAFFE_DECLARE_KNOWN_TYPE(24, _CaffeHighestPreallocatedTypeId); +CAFFE_DECLARE_KNOWN_TYPE(23, _CaffeHighestPreallocatedTypeId); } diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index c7215e0ed28b32..35c2008d4fdab0 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -21,7 +21,7 @@ class IDEEPContext final : public BaseContext { CAFFE_ENFORCE_EQ(option.device_type(), IDEEP); } - ~IDEEPContext() noexcept override {} + ~IDEEPContext() noexcept {} BaseStaticContext* GetStaticContext() const override { return GetIDEEPStaticContext(); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index 755e1b5a57b8a9..45f55ab2407a2e 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -489,7 +489,7 @@ bool RunOnDevice() override { "noise_size", 491 /* prime to avoid artifacts */); // Treaded as half4 in the kernel, so need half4 here. 
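The padding computed on the next line exists because the kernel reads the noise buffer as half4, so its length must be a multiple of four. The arithmetic, restated in Python as an illustration of the rounding only:

    def div_round_up(value, divisor):
        return (value + divisor - 1) // divisor

    noise_size = 491                            # the prime default used above
    padded = div_round_up(noise_size, 4) * 4
    assert padded == 492 and padded % 4 == 0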
noiseSize = divRoundUp(noiseSize, 4) * 4; - if (!noiseBlob->IsType(CPU) || + if (!noiseBlob->IsType() || noiseBlob->Get().size() != noiseSize) { VLOG(2) << "Initializing stylizer with noise: " << noiseSize; caffe2::Timer rt; diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index bcf588d8a384f0..9f032e6fe299d0 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -94,7 +94,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); t->Resize(BS, C, H, W); CPUContext ctx; math::RandGaussian( @@ -152,7 +152,7 @@ void testMPSCNN() { Workspace ws; for (auto i = 0; i < N; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); switch (ndim) { case 1: t->Resize(5); @@ -210,7 +210,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNormalizePlanarYUV Test: "; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -218,14 +218,14 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(1, channels); CPUContext ctx; math::RandGaussian( t->size(), 0, 1, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("stddev")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("stddev")->GetMutable(); t->Resize(1, channels); CPUContext ctx; math::RandUniform( @@ -290,7 +290,7 @@ void testMPSCNN() { for (const auto dim : {10, 40}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, channels, dim, dim); CPUContext ctx; // Too noisy. @@ -299,7 +299,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -310,7 +310,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(channels); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -321,7 +321,7 @@ void testMPSCNN() { // t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("pw")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("pw")->GetMutable(); t->Resize(prelu == PreluTy::SHARED ? 1 : channels); CPUContext ctx; // Too noisy. @@ -409,7 +409,7 @@ void testMPSCNN() { Workspace ws; const auto channels = array ? 12 : 3; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -417,7 +417,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(shared ? 
channels : 1); CPUContext ctx; math::RandGaussian( @@ -480,7 +480,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSpatialBN Test: " << channels; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batch_size, channels, 8, 13); CPUContext ctx; math::RandGaussian( @@ -488,7 +488,7 @@ void testMPSCNN() { } for (const std::string name : {"scale", "bias", "mean", "var"}) { - auto* t = ws.CreateBlob(name)->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(name)->GetMutable(); t->Resize(channels); CPUContext ctx; // High mean to avoid var division by zero. @@ -575,7 +575,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNFC Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, CIn, H, W); CPUContext ctx; math::RandGaussian( @@ -583,7 +583,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(COut, CIn * H * W); CPUContext ctx; math::RandGaussian( @@ -591,7 +591,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(COut); CPUContext ctx; math::RandGaussian( @@ -683,7 +683,7 @@ void testMPSCNN() { Workspace ws; { auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, 8, 8, 13); CPUContext ctx; math::RandGaussian( @@ -784,7 +784,7 @@ void testMPSCNN() { std::vector>{{1, 3, 50, 80}, {1, 12, 50, 80}}) { Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(dims); CPUContext ctx; math::RandGaussian( @@ -860,7 +860,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNPreprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 8, 13, 4); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -869,7 +869,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -940,7 +940,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 3, 8, 24); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -949,7 +949,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 100; @@ -999,7 +999,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDeprocess Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 3, 1280, 720); CPUContext ctx; for (auto i = 0; i < t->size(); ++i) { @@ -1008,7 +1008,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("mean")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("mean")->GetMutable(); t->Resize(3); CPUContext ctx; t->mutable_data()[0] = 30; @@ -1072,7 +1072,8 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = + ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ 
-1080,7 +1081,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(8, 12, kernel_h, kernel_w); CPUContext ctx; math::RandGaussian( @@ -1092,7 +1093,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1188,7 +1189,7 @@ void testMPSCNN() { Workspace ws; int output_channels = input_channels * channel_multiplier; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, input_channels, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1196,7 +1197,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(output_channels, 1, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1204,7 +1205,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(output_channels); CPUContext ctx; math::RandGaussian( @@ -1275,7 +1276,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConvRelu Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1283,7 +1284,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1291,7 +1292,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1385,7 +1386,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1393,7 +1394,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(8, 12, 3, 3); CPUContext ctx; math::RandGaussian( @@ -1401,7 +1402,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(8); CPUContext ctx; math::RandGaussian( @@ -1493,7 +1494,7 @@ void testMPSCNN() { LOG(INFO) << "MPSConv Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1501,7 +1502,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("W")->GetMutable(); t->Resize(M, C, K, K); CPUContext ctx; math::RandGaussian( @@ -1509,7 +1510,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1607,7 +1608,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNConv Test - group"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, C, 12, 16); CPUContext ctx; math::RandGaussian( @@ -1615,7 +1616,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("W")->GetMutableTensor(CPU); + auto* t = 
ws.CreateBlob("W")->GetMutable(); t->Resize(M, C / group, K, K); CPUContext ctx; math::RandGaussian( @@ -1623,7 +1624,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("b")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("b")->GetMutable(); t->Resize(M); CPUContext ctx; math::RandGaussian( @@ -1726,7 +1727,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNMul Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1734,7 +1735,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1791,7 +1792,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSub Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1799,7 +1800,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(72); CPUContext ctx; math::RandGaussian( @@ -1856,7 +1857,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1864,7 +1865,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1921,7 +1922,7 @@ void testMPSCNN() { LOG(INFO) << "MPSAdd Test"; Workspace ws; { - auto* t = ws.CreateBlob("X0_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -1929,7 +1930,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("X1_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2011,7 +2012,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNNeuron Test: " << n; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 4, 12, 12); CPUContext ctx; math::RandGaussian( @@ -2065,7 +2066,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNDropout Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 12, 57, 72); CPUContext ctx; math::RandGaussian( @@ -2136,7 +2137,7 @@ void testMPSCNN() { << " - scale: " << scale; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, channels, 40, 40); CPUContext ctx; math::RandGaussian( @@ -2144,7 +2145,7 @@ void testMPSCNN() { } { // Use the batch-first encoding (n, [bbox]) - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("R")->GetMutable(); t->Resize(6, 5); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[5 * i + 0] = 0; // batch @@ -2250,14 +2251,14 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNRoIWarp Test 2"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(1, 8, 40, 40); CPUContext ctx; math::RandGaussian( 
t->size(), 4, 2, t->mutable_data(), &ctx); } { - auto* t = ws.CreateBlob("R")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("R")->GetMutable(); t->Resize(6, 4); for (auto i = 0; i < t->dim32(0); ++i) { t->mutable_data()[4 * i + 0] = (i % 4 + 1) * 1.0 / scale; @@ -2362,7 +2363,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNResizeNearestOp Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(N, C, 37, 89); CPUContext ctx; math::RandGaussian( @@ -2497,7 +2498,7 @@ void testMPSCNN() { vector im_info{60, 80, 0.166667}; vector anchors{-38, -16, 53, 31, -120, -120, 135, 135}; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(num_images, A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = scores[i]; @@ -2505,7 +2506,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("bbox_delta_cpu")->GetMutable(); t->Resize(num_images, 4 * A, H, W); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = bbx[i]; @@ -2513,7 +2514,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("im_info")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("im_info")->GetMutable(); t->Resize(num_images, 3); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = im_info[i]; @@ -2521,7 +2522,7 @@ void testMPSCNN() { } { - auto* t = ws.CreateBlob("anchors")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("anchors")->GetMutable(); t->Resize(A, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = anchors[i]; @@ -2587,7 +2588,7 @@ void testMPSCNN() { LOG(INFO) << "MPSCNNSoftmax Test"; Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); // Only works for spatial dimension of (1, 1) - weird. t->Resize(batchSize, 12, 1, 1); CPUContext ctx; @@ -2661,8 +2662,8 @@ void testMPSCNN() { LOG(INFO) << "MPSConvTranspose Test"; Workspace ws; { - auto* t = - ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu") + ->GetMutable(); t->Resize(batchSize, inputChannels, 8, 12); CPUContext ctx; math::RandGaussian( @@ -2675,7 +2676,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("W")->GetMutableTensor(CPU); + ws.CreateBlob("W")->GetMutable(); t->Resize( inputChannels, outputChannels, @@ -2692,7 +2693,7 @@ void testMPSCNN() { { auto* t = - ws.CreateBlob("b")->GetMutableTensor(CPU); + ws.CreateBlob("b")->GetMutable(); t->Resize(outputChannels); CPUContext ctx; math::RandGaussian( @@ -2809,7 +2810,7 @@ void testMPSCNN() { << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); t->Resize(batchSize, array ? 
(i + 1) * 4 : 4, 10, 10); CPUContext ctx; math::RandGaussian( @@ -2891,7 +2892,7 @@ void testMPSCNN() { } Workspace ws; { - auto* t = ws.CreateBlob("X_cpu")->GetMutableTensor(CPU); + auto* t = ws.CreateBlob("X_cpu")->GetMutable(); t->Resize(batchSize, inputChannels, 53, 47); CPUContext ctx; math::RandGaussian( @@ -2964,7 +2965,7 @@ void testMPSCNN() { << numInputs << ", " << batchSize; Workspace ws; for (auto i = 0; i < numInputs; ++i) { - auto* t = ws.CreateBlob(cpu(i))->GetMutableTensor(CPU); + auto* t = ws.CreateBlob(cpu(i))->GetMutable(); t->Resize(batchSize, channelCount, 9, 17); CPUContext ctx; math::RandGaussian( @@ -3337,7 +3338,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { cws.RunNetOnce(initNet); { auto* t = - cws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + cws.CreateBlob(predictNet.external_input(0))->GetMutable(); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3349,7 +3350,7 @@ void compareModels(const NetDef& initNet, NetDef predictNet) { mws.RunNetOnce(initNet); { auto* t = - mws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); + mws.CreateBlob(predictNet.external_input(0))->GetMutable(); t->Resize(1, 224, 224, 4); for (auto i = 0; i < t->size(); ++i) { t->mutable_data()[i] = i % 225; @@ -3397,16 +3398,16 @@ void verifyRewrite( dumpDef(predictNet); dumpDef(metalPredictNet); -#define RUN_NET(ws, predictNet) \ - ws.RunNetOnce(initNet); \ - { \ - auto* t = \ - ws.CreateBlob(predictNet.external_input(0))->GetMutableTensor(CPU); \ - t->Resize(inputDims); \ - CPUContext ctx; \ - math::RandGaussian( \ - t->size(), 0, 1, t->mutable_data(), &ctx); \ - } \ +#define RUN_NET(ws, predictNet) \ + ws.RunNetOnce(initNet); \ + { \ + auto* t = \ + ws.CreateBlob(predictNet.external_input(0))->GetMutable(); \ + t->Resize(inputDims); \ + CPUContext ctx; \ + math::RandGaussian( \ + t->size(), 0, 1, t->mutable_data(), &ctx); \ + } \ ws.RunNetOnce(predictNet); // initialize diff --git a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h index 2896bc26ac08d4..70b9ac05747511 100644 --- a/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h +++ b/caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h @@ -41,7 +41,7 @@ void RowsWhereRoILevelEquals(Eigen::Ref rois, // distribute those proposals to their appropriate FPN levels for Faster RCNN. // An anchor at one FPN level may predict an RoI that will map to another // level, hence the need to redistribute the proposals. -// Reference: facebookresearch/Detectron/detectron/ops/collect_and_distribute_fpn_rpn_proposals.py +// Reference: detectron/lib/ops/collect_and_distribute_fpn_rpn_proposals.py template class CollectAndDistributeFpnRpnProposalsOp final : public Operator { public: diff --git a/caffe2/operators/conv_op_cudnn.cc b/caffe2/operators/conv_op_cudnn.cc index 2f11645f21c5cc..ddb0f8f89c144b 100644 --- a/caffe2/operators/conv_op_cudnn.cc +++ b/caffe2/operators/conv_op_cudnn.cc @@ -602,12 +602,12 @@ bool CudnnConvOp::DoRunWithType() { kernel_w())); } else { vector dims(filter.dims().begin(), filter.dims().end()); + dims[0] /= group_; #if !CUDNN_VERSION_MIN(7, 0, 0) - // We only need to divide dims by group_ when CUDNN version < 7.0 - // see CUDA group convolution doc: https://fburl.com/dgj6dvpd order_ == StorageOrder::NCHW ? 
dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; #endif + dims[filter.ndim() - 1] /= group_; CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, @@ -959,12 +959,10 @@ bool CudnnConvGradientOp::DoRunWithType() { } else { vector dims(filter.dims().begin(), filter.dims().end()); #if !CUDNN_VERSION_MIN(7, 0, 0) - // We only need to divide dims by group_ when CUDNN version < 7.0 - // see CUDA group convolution doc: https://fburl.com/dgj6dvpd + dims[0] /= group_; +#endif order_ == StorageOrder::NCHW ? dims[1] /= group_ : dims[filter.ndim() - 1] /= group_; -#endif - CUDNN_ENFORCE(cudnnSetFilterNdDescriptor( filter_desc_, cudnnTypeWrapper::type, diff --git a/caffe2/operators/generate_proposals_op.h b/caffe2/operators/generate_proposals_op.h index faf4936495244f..81f7d9ac43123f 100644 --- a/caffe2/operators/generate_proposals_op.h +++ b/caffe2/operators/generate_proposals_op.h @@ -59,7 +59,7 @@ ERMatXf ComputeAllAnchors( // regression result 'deltas' as well as predefined bounding box shapes // 'anchors'. Greedy non-maximum suppression is applied to generate the // final bounding boxes. -// Reference: facebookresearch/Detectron/detectron/ops/generate_proposals.py +// Reference: detectron/lib/ops/generate_proposals.py template class GenerateProposalsOp final : public Operator { public: diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index 333514102b7d4b..0c4c345d382cb1 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -5,7 +5,7 @@ #include "caffe2/utils/math.h" // Bounding box utils for generate_proposals_op -// Reference: facebookresearch/Detectron/detectron/utils/boxes.py +// Reference: detectron/lib/utils/boxes.py namespace caffe2 { namespace utils { diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index 7b38cd6a1420d6..5d6f87d4d30563 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -19,7 +19,7 @@ namespace utils { // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. -// Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx +// Reference: detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // scores: scores for each bounding box, size: (M, 1) @@ -78,7 +78,7 @@ std::vector nms_cpu_upright( /** * Soft-NMS implementation as outlined in https://arxiv.org/abs/1704.04503. - * Reference: facebookresearch/Detectron/detectron/utils/cython_nms.pyx + * Reference: detectron/lib/utils/cython_nms.pyx * out_scores: Output updated scores after applying Soft-NMS * proposals: pixel coordinates of proposed bounding boxes, * size: (M, 4), format: [x1; y1; x2; y2] @@ -426,7 +426,7 @@ std::vector nms_cpu( // Reject a bounding box if its region has an intersection-overunion (IoU) // overlap with a higher scoring selected bounding box larger than a // threshold. 
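For readers who do not want to chase the Detectron reference, the rejection rule described above is ordinary greedy NMS. A compact Python sketch of the idea, not the cython_nms implementation and not the rotated-box (RRPN) variant handled further down:

    import numpy as np

    def box_area(b):
        return (b[2] - b[0] + 1.0) * (b[3] - b[1] + 1.0)

    def iou(a, b):
        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(ix2 - ix1 + 1.0, 0.0) * max(iy2 - iy1 + 1.0, 0.0)
        return inter / (box_area(a) + box_area(b) - inter)

    def greedy_nms(boxes, scores, thresh):
        order = list(np.argsort(scores)[::-1])   # highest score first
        keep = []
        while order:
            i = order.pop(0)
            keep.append(i)
            # Reject boxes whose IoU with the kept, higher-scoring box exceeds thresh.
            order = [j for j in order if iou(boxes[i], boxes[j]) <= thresh]
        return keep

    boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], dtype=float)
    scores = np.array([0.9, 0.8, 0.7])
    assert greedy_nms(boxes, scores, thresh=0.5) == [0, 2]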
-// Reference: facebookresearch/Detectron/detectron/lib/utils/cython_nms.pyx +// Reference: detectron/lib/utils/cython_nms.pyx // proposals: pixel coordinates of proposed bounding boxes, // size: (M, 4), format: [x1; y1; x2; y2] // size: (M, 5), format: [ctr_x; ctr_y; w; h; angle (degrees)] for RRPN diff --git a/caffe2/opt/converter.cc b/caffe2/opt/converter.cc index 37d675eba83a49..b4866618b4e607 100644 --- a/caffe2/opt/converter.cc +++ b/caffe2/opt/converter.cc @@ -146,6 +146,9 @@ REGISTER_CONVERTER(SpatialBN, BatchNormalizationConverter); TRIVIAL_CONVERTER(Flatten); REGISTER_CONVERTER(Flatten, FlattenConverter); +TRIVIAL_CONVERTER(BatchGather); +REGISTER_CONVERTER(BatchGather, BatchGatherConverter); + class AveragePoolConverter : public Converter { std::unique_ptr convertToNeuralNetOperator( const OperatorDef& op) override { @@ -202,6 +205,37 @@ class ConcatConverter : public Converter { }; REGISTER_CONVERTER(Concat, ConcatConverter); +class BatchMatMulConverter : public Converter { + std::unique_ptr convertToNeuralNetOperator( + const OperatorDef& op) override { + std::unique_ptr nnOp = + util::make_unique(); + auto argMap = getArgumentsFromOperator(op); + + auto c = dyn_cast(nnOp.get()); + if (argMap.count("trans_a")) { + CAFFE_ENFORCE(argMap["trans_a"].has_i(), "Invalid axis argument"); + int trans_a = static_cast(argMap["trans_a"].i()); + c->setTransA(!!trans_a); + } + if (argMap.count("trans_b")) { + CAFFE_ENFORCE(argMap["trans_b"].has_i(), "Invalid add_axis argument"); + int trans_b = static_cast(argMap["trans_b"].i()); + c->setTransB(!!trans_b); + } + if (argMap.count("broadcast")) { + CAFFE_ENFORCE(argMap["broadcast"].has_i(), "Invalid add_axis argument"); + int broadcast = static_cast(argMap["broadcast"].i()); + c->setBroadcast(!!broadcast); + } + return nnOp; + } + // Does not override default converter to OperatorDef + + virtual ~BatchMatMulConverter() {} +}; +REGISTER_CONVERTER(BatchMatMul, BatchMatMulConverter); + } // namespace std::unique_ptr convertToNeuralNetOperator( @@ -236,6 +270,145 @@ std::unique_ptr convertToNeuralNetOperator( return nnOp; } +void handleWhileOp( + repr::NNGraph& dfg, + repr::NNCFGraph& cfg, + repr::NNGraph::NodeRef& opNode, + repr::NNCFGraph::NodeRef& bbNode, + OperatorDef& op, + std::unordered_map& blobMap +) { + opNode->resetData(util::make_unique()); + auto argMap = Converter::getArgumentsFromOperator(op); + std::string bodyNetSerialized = argMap["body"].s(); + auto bodyNet = caffe2::NetDef(); + bodyNet.ParseFromString(bodyNetSerialized); + + std::unordered_map bodyBlobMap; + auto bodyNN = convertToNNModule(bodyNet, &bodyBlobMap); + repr::NNGraph bodyGraph = std::move(bodyNN.dataFlow); + repr::NNCFGraph bodyCFGraph = std::move(bodyNN.controlFlow); + + auto rev_sorted = algorithm::tarjans(&bodyGraph); + + for (auto& k : bodyBlobMap) { + auto name = k.first; + if (blobMap.count(name)) { + auto oldNode = blobMap[name]; + printf("Exit tensor %s is in the parent scope, inserting Phi node...\n", k.first.c_str()); + auto phiNode = dfg.createNode(util::make_unique()); // NN variant of a Phi node + // Clone the operator. 
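The phi insertion being performed here follows the usual SSA pattern: when the loop body redefines a blob that already exists in the parent scope, downstream readers must see a merge of the two definitions. A schematic Python version of that merge, a sketch of the idea rather than the nomnigraph API:

    def merge_definitions(parent_defs, body_defs):
        # parent_defs / body_defs map blob name -> producing node (strings here).
        merged = dict(parent_defs)
        for name, body_def in body_defs.items():
            if name in parent_defs:
                # Both scopes define the blob: readers see a phi of the two.
                merged[name] = ("phi", parent_defs[name], body_def)
            else:
                merged[name] = body_def
        return merged

    merged = merge_definitions({"X": "X@parent"}, {"X": "X@loop_body"})
    assert merged["X"] == ("phi", "X@parent", "X@loop_body")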
+ auto tensor = dyn_cast(blobMap[name]->data().get()); + auto* clonedTensor = tensor->clone(); + auto phiOut = dfg.createNode(std::unique_ptr(clonedTensor)); + dfg.createEdge(phiNode, phiOut); + dfg.createEdge(oldNode, phiNode); + dfg.createEdge(bodyBlobMap[name], phiNode); + blobMap[name] = phiOut; + for (auto& inEdge : opNode->getInEdges()) { + if (inEdge->tail() == oldNode) { + dfg.deleteEdge(inEdge); + dfg.createEdge(phiOut, opNode); + } + } + } + } + + // Dependencies simply have no producers + std::unordered_map inNodeMap; + for (auto& n : bodyGraph.getMutableNodes()) { + if (!isa(n->data())) { continue; } + if (n->getInEdges().size() == 0) { + auto name = dyn_cast(n->data().get())->getName(); + // TODO(bwasti): this may be needed, depending on constraints + //assert(blobMap.count(name) != 0 && "Loop body takes undefined dependency."); + if (blobMap.count(name)) { + inNodeMap[n] = blobMap[name]; + } + } + } + + CAFFE_ENFORCE(rev_sorted.front().getNodes().size() == 1, + "More than one exit node."); + CAFFE_ENFORCE(rev_sorted.back().getNodes().size() == 1, + "More than one entry node."); + + auto exit_tensor = *(rev_sorted.front().getNodes().begin()); + CAFFE_ENFORCE(isa(exit_tensor->data()), + "Exit node is not a tensor."); + + auto bodyNodes = bodyGraph.getMutableNodes(); + auto bodyEdges = bodyGraph.getMutableEdges(); + + for (auto node : bodyNodes) { + bodyGraph.importNode(node, dfg); + } + + for (auto edge : bodyEdges) { + bodyGraph.importEdge(edge, dfg); + } + + // Merge all dependencies + for (auto node : dfg.getMutableNodes()) { + if (inNodeMap.count(node)) { + dfg.replaceNode(node, inNodeMap[node]); + dfg.deleteNode(node); + } + } + + for (const auto& inEdge : opNode->getInEdges()) { + auto* inputData = dyn_cast(inEdge->tail()->data().get()); + auto* exitData = dyn_cast(exit_tensor->data().get()); + if (inputData->getName() == exitData->getName()) { + dfg.replaceNode(exit_tensor, inEdge->tail()); + dfg.deleteNode(exit_tensor); + } + } + + // CFG Handling + auto bodyCFNodes = bodyCFGraph.getMutableNodes(); + auto bodyCFEdges = bodyCFGraph.getMutableEdges(); + + // Create a while loop CFG node. + auto whileBasicBlock = util::make_unique>(); + for (auto& inEdge : opNode->getInEdges()) { + auto node = inEdge->tail(); + for (auto& parentInEdge : node->getInEdges()) { + auto parentNode = parentInEdge->tail(); + if (isa(parentNode->data().get())) { + whileBasicBlock->pushInstructionNode(parentNode); + } + } + } + whileBasicBlock->pushInstructionNode(opNode); + + auto whileCFNode = cfg.createNode(std::move(whileBasicBlock)); + cfg.createEdge(bbNode, whileCFNode, 0); + + // The true path executes the body of the loop, so we + // take that BB and point to it. + for (auto cfNode : bodyCFNodes) { + bodyCFGraph.importNode(cfNode, cfg); + // If the CFG node has no children, we loop back to the top of the + // while loop. + if (cfNode->getOutEdges().size() == 0) { + cfg.createEdge(cfNode, whileCFNode, 0); + } + // TODO check for a single entry point + if (cfNode->getInEdges().size() == 0) { + cfg.createEdge(whileCFNode, cfNode, 1); + } + } + for (auto cfEdge : bodyCFEdges) { + bodyCFGraph.importEdge(cfEdge, cfg); + } + + // Now create the false case. + bbNode = + cfg.createNode(util::make_unique>()); + cfg.createEdge(whileCFNode, bbNode, -1); +} + /// \brief Ingest a caffe2 protobuf model and output an NNModule. 
/// \param net The caffe2 protobuf NetDef @@ -282,9 +455,13 @@ repr::NNModule convertToNNModule(caffe2::NetDef &net, std::unordered_mapresetData(convertToNeuralNetOperator(op)); - auto currentBasicBlock = bbNode->mutableData()->get(); - currentBasicBlock->pushInstructionNode(opNode); + if (op.type() == "While") { + handleWhileOp(dfg, cfg, opNode, bbNode, op, blobMap); + } else { + opNode->resetData(convertToNeuralNetOperator(op)); + auto currentBasicBlock = bbNode->mutableData()->get(); + currentBasicBlock->pushInstructionNode(opNode); + } } repr::NNModule module; diff --git a/caffe2/opt/converter_nomigraph_test.cc b/caffe2/opt/converter_nomigraph_test.cc index 0bab53f738d7c2..69f51df49cbf74 100644 --- a/caffe2/opt/converter_nomigraph_test.cc +++ b/caffe2/opt/converter_nomigraph_test.cc @@ -48,3 +48,65 @@ TEST(Converter, UnknownType) { auto new_netdef = caffe2::convertToCaffe2Proto(nn); } +/* Temporarily disabled While conversion tests +TEST(Converter, While) { + caffe2::NetDef net; + + caffe2::OperatorDef *def = net.add_op(); + def->set_type("While"); + def->add_input("X"); + + caffe2::NetDef body_net; + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Relu"); + rdef->add_input("X"); + rdef->add_output("X"); + } + std::string body_net_serialized; + assert(body_net.SerializeToString(&body_net_serialized)); + ADD_ARG(def, "body", s, body_net_serialized); + + auto nn = caffe2::convertToNNModule(net); +} + +TEST(Converter, ComplexWhile) { + caffe2::NetDef net; + + { + caffe2::OperatorDef *rdef = net.add_op(); + rdef->set_type("Relu"); + rdef->add_input("X"); + rdef->add_output("X"); + } + + caffe2::OperatorDef *def = net.add_op(); + def->set_type("While"); + def->add_input("X"); + + caffe2::NetDef body_net; + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Instr1"); + rdef->add_input("X"); + rdef->add_output("X"); + } + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Instr2"); + rdef->add_input("X"); + rdef->add_output("X"); + } + { + caffe2::OperatorDef *rdef = body_net.add_op(); + rdef->set_type("Instr3"); + rdef->add_input("X"); + rdef->add_output("X"); + } + std::string body_net_serialized; + assert(body_net.SerializeToString(&body_net_serialized)); + ADD_ARG(def, "body", s, body_net_serialized); + + auto nn = caffe2::convertToNNModule(net); +} +*/ diff --git a/caffe2/opt/device.cc b/caffe2/opt/device.cc index 0cfdd6c1dc91a3..9abca6d67e08b3 100644 --- a/caffe2/opt/device.cc +++ b/caffe2/opt/device.cc @@ -9,14 +9,15 @@ std::vector getInputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector inputTensorEdges; - for (const auto& node : sg.getNodes()) { + for (const auto& node : sg.Nodes) { NOM_REQUIRE_OR_CONT(nn::is(node)); NOM_REQUIRE_OR_CONT(nn::hasInputs(node)); // Check if tensor's parents are in the sg for (const auto& input : nn::getInputs(node)) { NOM_REQUIRE_OR_CONT( - !nn::hasProducer(input) || !sg.hasNode(nn::getProducer(input))); + !nn::hasProducer(input) || + sg.Nodes.count(nn::getProducer(input)) == 0); inputTensorEdges.emplace_back(g.getEdge(input, node)); } } @@ -27,13 +28,13 @@ std::vector getOutputEdges( const NNGraph::SubgraphType& sg, const NNGraph& g) { std::vector outputTensorEdges; - for (const auto& node : sg.getNodes()) { + for (const auto& node : sg.Nodes) { NOM_REQUIRE_OR_CONT(nn::is(node)); for (const auto& output : nn::getOutputs(node)) { auto consumers = nn::getConsumers(output); for (const auto& consumer : consumers) { - NOM_REQUIRE_OR_CONT(!sg.hasNode(consumer)); + 
NOM_REQUIRE_OR_CONT(sg.Nodes.count(consumer) == 0); outputTensorEdges.emplace_back(g.getEdge(node, output)); } NOM_REQUIRE_OR_CONT(consumers.size() == 0); diff --git a/caffe2/opt/fusion.cc b/caffe2/opt/fusion.cc index f5ea0f678ed515..8a1b736399562a 100644 --- a/caffe2/opt/fusion.cc +++ b/caffe2/opt/fusion.cc @@ -1,6 +1,5 @@ -#include "caffe2/opt/fusion.h" -#include "caffe2/core/logging.h" #include "caffe2/opt/converter.h" +#include "caffe2/opt/fusion.h" #include "caffe2/opt/passes.h" namespace caffe2 { @@ -19,25 +18,27 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { for (auto convNode : repr::nn::nodeIterator(nn->dataFlow)) { auto output = repr::nn::getOutputs(convNode).front(); auto consumers = repr::nn::getConsumers(output); - NOM_REQUIRE_OR_CONT(consumers.size() == 1); - + if (consumers.size() != 1) { + continue; + } auto consumer = consumers.front(); - NOM_REQUIRE_OR_CONT(repr::nn::is(consumer)); - + if (!repr::nn::is(consumer)) { + continue; + } auto bnNode = consumer; auto bn = repr::nn::get(bnNode); - auto bnOutputs = nn::getOutputs(bnNode); - NOM_REQUIRE_OR_CONT(bnOutputs.size() == 1); - auto bnOutput = bnOutputs.front(); auto convInputs = repr::nn::getInputs(convNode); - CAFFE_ENFORCE( - convInputs.size() >= 3, - "Invalid convolution input size (TODO: optional bias)"); + if (convInputs.size() < 3) { + assert(0 && "Invalid convolution input size (TODO: optional bias)"); + continue; + } auto bnInputs = repr::nn::getInputs(bnNode); - CAFFE_ENFORCE( - bnInputs.size() >= 5, "Invalid batch normalization input size"); + if (bnInputs.size() < 5) { + assert(0 && "Invalid batch normalization input size"); + continue; + } #define EXPOSE_TENSOR_DATA(name, index, inputs) \ auto name = repr::nn::get(inputs[index]); \ @@ -68,8 +69,6 @@ bool fuseConvBNHelper(repr::NNModule* nn, caffe2::Workspace* ws) { biasConvData[c] = bias; } - nn->dataFlow.deleteNode(output); - nn->dataFlow.createEdge(convNode, bnOutput); nn->dataFlow.deleteNode(bnNode); return true; } diff --git a/caffe2/opt/mobile.cc b/caffe2/opt/mobile.cc index adbbbd19a1e367..6d0006818789bb 100644 --- a/caffe2/opt/mobile.cc +++ b/caffe2/opt/mobile.cc @@ -11,15 +11,23 @@ using namespace nom; void addNNPACK(repr::NNModule* nn, bool low_memory) { for (auto node : nn->dataFlow.getMutableNodes()) { + auto* nodeData = node->data().get(); // Let graph retain ownership. + // Skip blobs. - NOM_REQUIRE_OR_CONT(repr::nn::is(node)); + if (!isa(nodeData)) { + continue; + } // Check if it is a convolution. - auto nnOp = repr::nn::get(node); - NOM_REQUIRE_OR_CONT(isa(nnOp)); + auto nnOp = dyn_cast(nodeData); + if (!isa(nnOp)) { + continue; + } // Requires X, W, b for NNPACK - NOM_REQUIRE_OR_CONT(node->getInEdges().size() >= 3); + if (node->getInEdges().size() < 3) { + continue; + } std::string engine = "NNPACK"; @@ -27,7 +35,9 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { bool validTransformCandidate = true; auto conv = dyn_cast(nnOp); - NOM_REQUIRE_OR_CONT(conv->getLayout() == nom::repr::Conv::NNLayout::NCHW); + if (conv->getLayout() != nom::repr::Conv::NNLayout::NCHW) { + continue; + } // NNPACK only supports stride == 1 for (auto stride : conv->getStrides()) { @@ -36,21 +46,28 @@ void addNNPACK(repr::NNModule* nn, bool low_memory) { break; } } - NOM_REQUIRE_OR_CONT(validTransformCandidate); + if (!validTransformCandidate) { + continue; + } // NNPACK only supports 2DConv. 
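Collected in one place, the checks this rewrite performs before switching a convolution to the NNPACK engine amount to the predicate below (illustrative Python, not the nomnigraph code; the kernel-shape check appears just after this point in the diff):

    def nnpack_eligible(is_conv, num_inputs, layout, strides, kernel_shape):
        if not is_conv or num_inputs < 3:        # needs X, W, b
            return False
        if layout != "NCHW":
            return False
        if any(s != 1 for s in strides):         # NNPACK only supports stride 1
            return False
        if len(kernel_shape) != 2:               # 2D convolutions only
            return False
        kh, kw = kernel_shape
        if kh != kw and (kh == 1 or kw == 1):    # Kx1 / 1xK are inefficient, skip
            return False
        return True

    assert nnpack_eligible(True, 3, "NCHW", [1, 1], [3, 3])
    assert not nnpack_eligible(True, 3, "NCHW", [2, 2], [3, 3])
    assert not nnpack_eligible(True, 3, "NCHW", [1, 1], [1, 3])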
const auto& kernelShape = conv->getKernelShape(); - NOM_REQUIRE_OR_CONT(kernelShape.size() == 2); + if (kernelShape.size() != 2) { + continue; + } // Kx1 and 1xK convs are inefficient in NNPACK. if (kernelShape[0] != kernelShape[1]) { - NOM_REQUIRE_OR_CONT(kernelShape[0] != 1 && kernelShape[1] != 1); + if (kernelShape[0] == 1 || kernelShape[1] == 1) { + continue; + } } // We're good to use our engine. auto annotation = conv->getMutableAnnotation(); - NOM_REQUIRE_OR_CONT(annotation && isa(annotation)); - + if (!annotation || !isa(annotation)) { + continue; + } auto* op = dyn_cast(annotation)->getMutableOperatorDef(); op->set_engine(engine); if (!low_memory) { diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 09528b99b5da51..75baec0e9be66b 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -323,10 +323,8 @@ void OnnxifiTransformer::Transform( // function to tell whether the ONNXIFI backend supports a given C2 op or not // TODO: choose backend id - onnxifi_library* backend = lib_; - onnxBackendID backend_id = backend_ids_[0]; auto supports = - [&exporter, &shape_hints, backend, backend_id]( + [&exporter, &shape_hints, backend = lib_, backend_id = backend_ids_[0]]( const caffe2::OperatorDef& op) { const OpSchema* schema = OpSchemaRegistry::Schema(op.type()); // NB: this might not be a hard constraint as we can just export C2 diff --git a/caffe2/predictor/predictor.cc b/caffe2/predictor/predictor.cc index 03264daf50f6a7..4c1e13d1008ac8 100644 --- a/caffe2/predictor/predictor.cc +++ b/caffe2/predictor/predictor.cc @@ -2,7 +2,6 @@ #ifdef CAFFE2_OPTIMIZER #include "caffe2/opt/optimizer.h" #endif -#include "caffe2/utils/proto_utils.h" #include #include "caffe2/core/init.h" @@ -97,9 +96,7 @@ Predictor::Predictor( GlobalInit(); #endif auto predict_net = config_.predict_net; - - if (optimization && - !ArgumentHelper::HasArgument(*predict_net, "disable_nomnigraph")) { + if (optimization) { #ifdef CAFFE2_OPTIMIZER try { *predict_net = opt::optimize(*predict_net, &ws_, optimization); diff --git a/caffe2/predictor/predictor.h b/caffe2/predictor/predictor.h index 458bf4401476c4..a3f05d7aacac89 100644 --- a/caffe2/predictor/predictor.h +++ b/caffe2/predictor/predictor.h @@ -28,7 +28,7 @@ class Predictor { const NetDef& run_net, Workspace* parent = nullptr, bool run_init = true, - int optimization = 1); + int optimization = 0); ~Predictor() {} diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index dd1734a587c1fc..d10bfe209f7b39 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -630,7 +630,7 @@ def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): beta=st.floats(min_value=0.1, max_value=0.9), lambda1=st.floats(min_value=0.001, max_value=0.1), lambda2=st.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), + engine=st.sampled_from([None]), **hu.gcs_cpu_only) def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, engine, gc, dc): diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index d2b6a4f6399fff..b1c0e1cd885ea4 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -5,12 +5,10 @@ from __future__ import print_function from __future__ import unicode_literals -from abc import ABCMeta, abstractmethod import argparse from future.utils import viewitems import logging import numpy as np -from six import with_metaclass 
import sys from caffe2.python import core, rnn_cell, workspace @@ -33,60 +31,7 @@ def _weighted_sum(model, values, weight, output_name): ) -class Seq2SeqModelCaffe2EnsembleDecoderBase(with_metaclass(ABCMeta, object)): - - @abstractmethod - def get_model_file(self, model): - pass - - @abstractmethod - def get_db_type(self): - pass - - def build_word_rewards(self, vocab_size, word_reward, unk_reward): - word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) - word_rewards[seq2seq_util.PAD_ID] = 0 - word_rewards[seq2seq_util.GO_ID] = 0 - word_rewards[seq2seq_util.EOS_ID] = 0 - word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward - return word_rewards - - def load_models(self): - db_reader = 'reader' - for model, scope_name in zip( - self.models, - self.decoder_scope_names, - ): - params_for_current_model = [ - param - for param in self.model.GetAllParams() - if str(param).startswith(scope_name) - ] - assert workspace.RunOperatorOnce(core.CreateOperator( - 'CreateDB', - [], [db_reader], - db=self.get_model_file(model), - db_type=self.get_db_type()) - ), 'Failed to create db {}'.format(self.get_model_file(model)) - assert workspace.RunOperatorOnce(core.CreateOperator( - 'Load', - [db_reader], - params_for_current_model, - load_all=1, - add_prefix=scope_name + '/', - strip_prefix='gpu_0/', - )) - logger.info('Model {} is loaded from a checkpoint {}'.format( - scope_name, self.get_model_file(model))) - - -class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase): - - def get_model_file(self, model): - return model['model_file'] - - def get_db_type(self): - return 'minidb' +class Seq2SeqModelCaffe2EnsembleDecoder(object): def scope(self, scope_name, blob_name): return ( @@ -313,6 +258,14 @@ def _build_decoder( attention_weights, ) + def build_word_rewards(self, vocab_size, word_reward, unk_reward): + word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) + word_rewards[seq2seq_util.PAD_ID] = 0 + word_rewards[seq2seq_util.GO_ID] = 0 + word_rewards[seq2seq_util.EOS_ID] = 0 + word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward + return word_rewards + def __init__( self, translate_params, @@ -461,6 +414,36 @@ def __init__( for param in self.model.params: logger.info(param) + def load_models(self): + db_reader = 'reader' + for model, scope_name in zip( + self.models, + self.decoder_scope_names, + ): + params_for_current_model = [ + param + for param in self.model.GetAllParams() + if str(param).startswith(scope_name) + ] + assert workspace.RunOperatorOnce(core.CreateOperator( + 'CreateDB', + [], [db_reader], + db=model['model_file'], + db_type='minidb') + ), 'Failed to create db {}'.format(model['model_file']) + assert workspace.RunOperatorOnce(core.CreateOperator( + 'Load', + [db_reader], + params_for_current_model, + load_all=1, + add_prefix=scope_name + '/', + strip_prefix='gpu_0/', + )) + logger.info('Model {} is loaded from a checkpoint {}'.format( + scope_name, + model['model_file'], + )) + def decode(self, numberized_input, max_output_seq_len): workspace.FeedBlob( self.encoder_inputs, diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 93e45704fcfea6..dab79b8b1fb0b4 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -35,7 +35,6 @@ import onnx.defs import onnx.optimizer import onnx.shape_inference -import onnx.utils from onnx.backend.base import Backend, Device, DeviceType, namedtupledict from caffe2.python.onnx.workspace import Workspace @@ -877,7 +876,6 @@ def 
_graph_to_net(cls, onnx_graph, opset_version): def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_initializers): device_option = get_device_option(Device(device)) - onnx_model = onnx.utils.polish_model(onnx_model) init_model = cls.optimize_onnx(onnx_model, init=True) pred_model = cls.optimize_onnx(onnx_model, predict=True) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index ee60d776d55a82..db870972f83946 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1421,8 +1421,7 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - if engine == "SIMD": - assert core.IsOperator('GFtrl_ENGINE_SIMD') + # SIMD version of GFTRL is not supported gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 3c42c2073163cd..07f88def015544 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -20,7 +20,6 @@ def add_tensor(net, name, blob): np.dtype('int32'): "GivenTensorIntFill", np.dtype('int64'): "GivenTensorInt64Fill", np.dtype('uint8'): "GivenTensorStringFill", - np.dtype('O'): "GivenTensorStringFill" } shape = blob.shape @@ -30,12 +29,6 @@ def add_tensor(net, name, blob): if blob.dtype == np.dtype('uint8'): shape = [1] values = [str(blob.data)] - # Only allow string arrays as objects. - # The only intended use case for this is to store arrays of strings in the - # model which can be used for post processing results in subsequent ops. - if blob.dtype == np.dtype('O'): - for blob_val in blob: - assert(isinstance(blob_val, bytes)) op = core.CreateOperator( kTypeNameMapper[blob.dtype], diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index 1c4cf77ea0512f..e7bbe2c90351c4 100644 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -73,15 +73,11 @@ def test_mobile_exporter_datatypes(self): model = ModelHelper(name="mobile_exporter_test_model") model.Copy("data_int", "out") model.params.append("data_int") - model.Copy("data_obj", "out_obj") - model.params.append("data_obj") # Create our mobile exportable networks workspace.RunNetOnce(model.param_init_net) np_data_int = np.random.randint(100, size=(1, 1, 28, 28), dtype=np.int32) workspace.FeedBlob("data_int", np_data_int) - np_data_obj = np.array(['aa', 'bb']).astype(np.dtype('O')) - workspace.FeedBlob("data_obj", np_data_obj) init_net, predict_net = mobile_exporter.Export( workspace, model.net, model.params @@ -90,7 +86,6 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(model.net) workspace.RunNet(model.net) ref_out = workspace.FetchBlob("out") - ref_out_obj = workspace.FetchBlob("out_obj") # Clear the workspace workspace.ResetWorkspace() @@ -102,11 +97,9 @@ def test_mobile_exporter_datatypes(self): workspace.CreateNet(predict_net, True) workspace.RunNet(predict_net.name) manual_run_out = workspace.FetchBlob("out") - manual_run_out_obj = workspace.FetchBlob("out_obj") np.testing.assert_allclose( ref_out, manual_run_out, atol=1e-10, rtol=1e-10 ) - np.testing.assert_equal(ref_out_obj, manual_run_out_obj) # Clear the workspace workspace.ResetWorkspace() @@ -116,17 +109,11 @@ def test_mobile_exporter_datatypes(self): init_net.SerializeToString(), predict_net.SerializeToString() ) - # Output is a vector of outputs. 
+ # Output is a vector of outputs but we only care about the first and only result predictor_out = predictor.run([]) - assert len(predictor_out) == 2 - predictor_out_int = predictor_out[1] - predictor_out_obj = predictor_out[0] - # The order in predictor_out is non-deterministic. Use type of the entry - # to figure out what to compare it to. - if isinstance(predictor_out[1][0], bytes): - predictor_out_int = predictor_out[0] - predictor_out_obj = predictor_out[1] + assert len(predictor_out) == 1 + predictor_out = predictor_out[0] + np.testing.assert_allclose( - ref_out, predictor_out_int, atol=1e-10, rtol=1e-10 + ref_out, predictor_out, atol=1e-10, rtol=1e-10 ) - np.testing.assert_equal(ref_out_obj, predictor_out_obj) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 6e66cd75315716..2edc88ce0458d4 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -179,7 +179,6 @@ def test_transformer_SinkMaxPool(self): epsilon=st.floats(min_value=1e-5, max_value=1e-2), ) def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon): - workspace.ResetWorkspace() net = core.Net("net") c = input_channels h = size @@ -205,22 +204,16 @@ def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon workspace.FeedBlob("scale", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("bias", np.random.rand(c).astype(np.float32)) workspace.FeedBlob("mean", np.random.rand(c).astype(np.float32)) - # This is necessary because 1/sqrt(var) is used and if var is too small - # we get floating point artifacts that cause test failures - workspace.FeedBlob("var", np.random.rand(c).astype(np.float32) + 0.5) + workspace.FeedBlob("var", np.random.rand(c).astype(np.float32)) workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2").flatten() - workspace.FeedBlob("Y2", np.zeros((1, 1))) + preTransformOutput = workspace.FetchBlob("Y2") transformer.FuseConvBN(net) # Ensure fusion assert len(net.Proto().op) == 1 workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2").flatten() + postTransformOutput = workspace.FetchBlob("Y2") # Check that there is no numerical difference assert np.allclose( - preTransformOutput, - postTransformOutput, - rtol=1e-02, - atol=1e-04 + preTransformOutput, postTransformOutput, rtol=1e-05, atol=1e-08 ) diff --git a/caffe2/requirements.txt b/caffe2/requirements.txt index 07fd95b72582a2..9a1d67efc7c2f3 100644 --- a/caffe2/requirements.txt +++ b/caffe2/requirements.txt @@ -1,4 +1,2 @@ numpy enum34 -pyyaml -typing diff --git a/caffe2/utils/Array.h b/caffe2/utils/Array.h index ad9a80ed9203b5..921deb9b0b41aa 100644 --- a/caffe2/utils/Array.h +++ b/caffe2/utils/Array.h @@ -38,10 +38,10 @@ #pragma once -#include +#include #include #include -#include +#include "caffe2/utils/C++17.h" namespace c10 { namespace guts { @@ -101,32 +101,32 @@ class array final { // No explicit construct/copy/destroy for aggregate type. // DR 776. - AT_CPP14_CONSTEXPR void fill(const value_type& __u) + C10_CPP14_CONSTEXPR void fill(const value_type& __u) { std::fill_n(begin(), size(), __u); } - AT_CPP14_CONSTEXPR void swap(array& __other) + C10_CPP14_CONSTEXPR void swap(array& __other) { std::swap_ranges(begin(), end(), __other.begin()); } // Iterators. 
- AT_CPP14_CONSTEXPR iterator begin() noexcept + C10_CPP14_CONSTEXPR iterator begin() noexcept { return iterator(data()); } constexpr const_iterator begin() const noexcept { return const_iterator(data()); } - AT_CPP14_CONSTEXPR iterator end() noexcept + C10_CPP14_CONSTEXPR iterator end() noexcept { return iterator(data() + _Nm); } constexpr const_iterator end() const noexcept { return const_iterator(data() + _Nm); } - AT_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept + C10_CPP14_CONSTEXPR reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } constexpr const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - AT_CPP14_CONSTEXPR reverse_iterator rend() noexcept + C10_CPP14_CONSTEXPR reverse_iterator rend() noexcept { return reverse_iterator(begin()); } constexpr const_reverse_iterator rend() const noexcept @@ -152,13 +152,13 @@ class array final { constexpr bool empty() const noexcept { return size() == 0; } // Element access. - AT_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept + C10_CPP14_CONSTEXPR reference operator[](size_type __n) noexcept { return _AT_Type::_S_ref(_M_elems, __n); } constexpr const_reference operator[](size_type __n) const noexcept { return _AT_Type::_S_ref(_M_elems, __n); } - AT_CPP14_CONSTEXPR reference at(size_type __n) { + C10_CPP14_CONSTEXPR reference at(size_type __n) { if (__n >= _Nm) { detail::__throw_out_of_range(std::string() + "array::at: __n (which is " + to_string(__n) + ") " + @@ -177,13 +177,13 @@ class array final { _AT_Type::_S_ref(_M_elems, 0)); } - AT_CPP14_CONSTEXPR reference front() noexcept + C10_CPP14_CONSTEXPR reference front() noexcept { return *begin(); } constexpr const_reference front() const noexcept { return _AT_Type::_S_ref(_M_elems, 0); } - AT_CPP14_CONSTEXPR reference back() noexcept + C10_CPP14_CONSTEXPR reference back() noexcept { return _Nm ? 
*(end() - 1) : *end(); } constexpr const_reference back() const noexcept @@ -192,7 +192,7 @@ class array final { : _AT_Type::_S_ref(_M_elems, 0); } - AT_CPP14_CONSTEXPR pointer data() noexcept + C10_CPP14_CONSTEXPR pointer data() noexcept { return _AT_Type::_S_ptr(_M_elems); } constexpr const_pointer data() const noexcept diff --git a/caffe2/utils/C++17.cpp b/caffe2/utils/C++17.cpp new file mode 100644 index 00000000000000..d75d9fc9dff490 --- /dev/null +++ b/caffe2/utils/C++17.cpp @@ -0,0 +1 @@ +#include "caffe2/utils/C++17.h" diff --git a/aten/src/ATen/core/C++17.h b/caffe2/utils/C++17.h similarity index 93% rename from aten/src/ATen/core/C++17.h rename to caffe2/utils/C++17.h index 5112d9070dcd5e..0186944e251159 100644 --- a/aten/src/ATen/core/C++17.h +++ b/caffe2/utils/C++17.h @@ -95,14 +95,10 @@ template using decay_t = typename std::decay::type; #ifdef __cpp_lib_logical_traits -template -using conjunction = std::conjunction; -template -using disjunction = std::disjunction; -template -using bool_constant = std::bool_constant; -template -using negation = std::negation; +using conjunction = std::conjunction; +using disjunction = std::disjunction; +using bool_constant = std::bool_constant; +using negation = std::negation; #else @@ -149,10 +145,7 @@ template using void_t = typename make_void::type; #ifdef __cpp_lib_apply -template -inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { - return std::apply(std::forward(f), std::forward(t)); -} +using apply = std::apply; #else @@ -182,9 +175,9 @@ constexpr auto apply(F&& f, Tuple&& t) -> decltype(detail::apply_impl( #if defined(__cpp_constexpr) && __cpp_constexpr >= 201304 -# define AT_CPP14_CONSTEXPR constexpr +# define C10_CPP14_CONSTEXPR constexpr #else -# define AT_CPP14_CONSTEXPR +# define C10_CPP14_CONSTEXPR #endif diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 67897c36fe485a..5db06663bf6403 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -63,6 +63,8 @@ set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} set(LIB_SOURCES_CPU Array.cpp + C++17.cpp + IdWrapper.cpp Optional.cpp Metaprogramming.cpp TypeList.cpp diff --git a/caffe2/utils/IdWrapper.cpp b/caffe2/utils/IdWrapper.cpp new file mode 100644 index 00000000000000..7646a1392d4a6b --- /dev/null +++ b/caffe2/utils/IdWrapper.cpp @@ -0,0 +1 @@ +#include "caffe2/utils/IdWrapper.h" diff --git a/caffe2/utils/IdWrapper.h b/caffe2/utils/IdWrapper.h new file mode 100644 index 00000000000000..0c8e548ca017f6 --- /dev/null +++ b/caffe2/utils/IdWrapper.h @@ -0,0 +1,67 @@ +#pragma once + +#include + +namespace c10 { namespace guts { + +/** + * This template simplifies generation of simple classes that wrap an id + * in a typesafe way. Namely, you can use it to create a very lightweight + * type that only offers equality comparators and hashing. Example: + * + * struct MyIdType final : IdWrapper { + * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} + * }; + * + * Then in the global top level namespace: + * + * C10_DEFINE_IDWRAPPER(MyIdType); + * + * That's it - equality operators and hash functions are automatically defined + * for you, given the underlying type supports it. 
+ */ +template <class ConcreteType, class UnderlyingType> +class IdWrapper { +public: + using underlying_type = UnderlyingType; + using concrete_type = ConcreteType; + +protected: + constexpr explicit IdWrapper(underlying_type id) noexcept(noexcept(underlying_type(std::declval<underlying_type>()))) + : id_(id) {} + + constexpr underlying_type underlyingId() const noexcept(noexcept(underlying_type(std::declval<underlying_type>()))) { + return id_; + } + +private: + friend size_t hash_value(const concrete_type& v) { + return std::hash<underlying_type>()(v.id_); + } + + // TODO Making operator== noexcept if underlying type is noexcept equality comparable doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator==(const concrete_type& lhs, const concrete_type& rhs) { + return lhs.id_ == rhs.id_; + } + + // TODO Making operator!= noexcept if operator== is noexcept doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator!=(const concrete_type& lhs, const concrete_type& rhs) { + return !(lhs == rhs); + } + + underlying_type id_; +}; + +}} + +#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ + namespace std { \ + template <> \ + struct hash<ClassName> { \ + size_t operator()(ClassName x) const { \ + return hash_value(x); \ + } \ + }; \ + } diff --git a/caffe2/utils/TypeList.h b/caffe2/utils/TypeList.h index 7c20fa6613b966..3494843feae121 100644 --- a/caffe2/utils/TypeList.h +++ b/caffe2/utils/TypeList.h @@ -1,6 +1,6 @@ #pragma once -#include <ATen/core/C++17.h> +#include "caffe2/utils/C++17.h" #include "caffe2/utils/TypeTraits.h" namespace c10 { namespace guts { namespace typelist { diff --git a/caffe2/utils/TypeTraits.h b/caffe2/utils/TypeTraits.h index c60f8a00b1ebdd..004586987a81f7 100644 --- a/caffe2/utils/TypeTraits.h +++ b/caffe2/utils/TypeTraits.h @@ -1,6 +1,6 @@ #pragma once -#include <ATen/core/C++17.h> +#include "caffe2/utils/C++17.h" #include namespace c10 { diff --git a/caffe2/utils/math_cpu.cc b/caffe2/utils/math_cpu.cc index c573542af5763c..e0ae5cc0336e2a 100644 --- a/caffe2/utils/math_cpu.cc +++ b/caffe2/utils/math_cpu.cc @@ -2605,13 +2605,6 @@ bool TransposeWithHPTT( axes_cm[i] = cm_fn(axes[cm_fn(i)]); dims_cm[i] = dims[cm_fn(i)]; } - - // HPTT doesn't handle 0 sized inputs.
- for (auto dim : dims_cm) { - if (dim <= 0) { - return false; - } - } auto plan = hptt::create_plan( axes_cm.data(), ndim, diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 3829219a933b5d..bc30f35f2a2eee 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -1,9 +1,3 @@ -# This ill-named file does a number of things: -# - Installs Caffe2 header files (this has nothing to do with code generation) -# - Configures caffe2/core/macros.h -# - Creates an ATen target for its generated C++ files and adds it -# as a dependency - if (DEFINED ENV{PYTORCH_PYTHON}) message(STATUS "Using python found in $ENV{PYTORCH_PYTHON}") set(PYCMD "$ENV{PYTORCH_PYTHON}") @@ -20,11 +14,6 @@ configure_file( install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2 DESTINATION include FILES_MATCHING PATTERN "*.h") -if (NOT BUILD_ATEN) - install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core - DESTINATION include/ATen/core - FILES_MATCHING PATTERN "*.h") -endif() install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h DESTINATION include/caffe2/core) diff --git a/cmake/MiscCheck.cmake b/cmake/MiscCheck.cmake index 2f2628bb149866..2a4e61f97b0b18 100644 --- a/cmake/MiscCheck.cmake +++ b/cmake/MiscCheck.cmake @@ -83,26 +83,22 @@ endif() cmake_pop_check_state() # ---[ Check for NUMA support -if (USE_NUMA) - cmake_push_check_state(RESET) - set(CMAKE_REQUIRED_FLAGS "-std=c++11") - CHECK_CXX_SOURCE_COMPILES( +cmake_push_check_state(RESET) +set(CMAKE_REQUIRED_FLAGS "-std=c++11") +CHECK_CXX_SOURCE_COMPILES( "#include #include int main(int argc, char** argv) { }" CAFFE2_IS_NUMA_AVAILABLE) - if (CAFFE2_IS_NUMA_AVAILABLE) - message(STATUS "NUMA is available") - else() - message(STATUS "NUMA is not available") - set(CAFFE2_DISABLE_NUMA 1) - endif() - cmake_pop_check_state() + +if (CAFFE2_IS_NUMA_AVAILABLE) + message(STATUS "NUMA is available") else() - message(STATUS "NUMA is disabled") + message(STATUS "NUMA is not available") set(CAFFE2_DISABLE_NUMA 1) endif() +cmake_pop_check_state() # ---[ Check if we want to turn off deprecated warning due to glog. # Note(jiayq): on ubuntu 14.04, the default glog install uses ext/hash_set that @@ -161,15 +157,6 @@ if (${COMPILER_SUPPORTS_HIDDEN_INLINE_VISIBILITY}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CAFFE2_VISIBILITY_FLAG}") endif() -# ---[ Checks if linker supports -rdynamic. `-rdynamic` tells linker -# -to add all (including unused) symbols into the dynamic symbol -# -table. We need this to get symbols when generating backtrace at -# -runtime. -check_cxx_compiler_flag("-rdynamic" COMPILER_SUPPORTS_RDYNAMIC) -if (${COMPILER_SUPPORTS_RDYNAMIC}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -rdynamic") -endif() - # ---[ If we are using msvc, set no warning flags # Note(jiayq): if you are going to add a warning flag, check if this is # totally necessary, and only add when you see fit. If it is needed due to diff --git a/docs/libtorch.rst b/docs/libtorch.rst deleted file mode 100644 index 9ab59a4d749d66..00000000000000 --- a/docs/libtorch.rst +++ /dev/null @@ -1,19 +0,0 @@ -libtorch (C++-only) -=================== - -The core of pytorch can be built and used without Python. A -CMake-based build system compiles the C++ source code into a shared -object, libtorch.so. - -Building libtorch ------------------ - -There is a script which wraps the CMake build. 
Invoke it with - -:: - cd pytorch - BUILD_TORCH=ON ONNX_NAMESPACE=onnx_torch bash tools/build_pytorch_libs.sh --use-nnpack caffe2 - ls torch/lib/tmp_install # output is produced here - ls torch/lib/tmp_install/lib/libtorch.so # of particular interest - -Future work will simplify this further. diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst index de541b467e819e..93224462e3177e 100644 --- a/docs/source/distributions.rst +++ b/docs/source/distributions.rst @@ -203,15 +203,6 @@ Probability distributions - torch.distributions :undoc-members: :show-inheritance: -:hidden:`NegativeBinomial` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: torch.distributions.negative_binomial -.. autoclass:: NegativeBinomial - :members: - :undoc-members: - :show-inheritance: - :hidden:`Normal` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 283409ea3676b8..987044bbd212f4 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -338,12 +338,6 @@ Non-linear activations (weighted sum, nonlinearity) .. autoclass:: SELU :members: -:hidden:`CELU` -~~~~~~~~~~~~~~ - -.. autoclass:: CELU - :members: - :hidden:`Sigmoid` ~~~~~~~~~~~~~~~~~ @@ -610,12 +604,6 @@ Loss functions .. autoclass:: CrossEntropyLoss :members: -:hidden:`CTCLoss` -~~~~~~~~~~~~~~~~~ - -.. autoclass:: CTCLoss - :members: - :hidden:`NLLLoss` ~~~~~~~~~~~~~~~~~ @@ -996,11 +984,6 @@ Non-linear activation functions .. autofunction:: selu -:hidden:`celu` -~~~~~~~~~~~~~~ - -.. autofunction:: celu - :hidden:`leaky_relu` ~~~~~~~~~~~~~~~~~~~~ @@ -1197,11 +1180,6 @@ Loss functions .. autofunction:: cross_entropy -:hidden:`ctc_loss` -~~~~~~~~~~~~~~~~~~ - -.. autofunction:: ctc_loss - :hidden:`hinge_embedding_loss` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/scripts/build_activation_images.py b/docs/source/scripts/build_activation_images.py index e973933e205692..ce424d1ff188fa 100644 --- a/docs/source/scripts/build_activation_images.py +++ b/docs/source/scripts/build_activation_images.py @@ -36,7 +36,6 @@ 'ReLU6', 'RReLU', 'SELU', - 'CELU', 'Sigmoid', 'Softplus', 'Softshrink', diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 06b0305d28aae8..c3c85797b4cd82 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -46,7 +46,7 @@ A tensor can be constructed from a Python :class:`list` or sequence using the If you have a numpy array and want to avoid a copy, use :func:`torch.as_tensor`. -A tensor of specific data type can be constructed by passing a +An tensor of specific data type can be constructed by passing a :class:`torch.dtype` and/or a :class:`torch.device` to a constructor or tensor creation op: diff --git a/docs/source/torch.rst b/docs/source/torch.rst index c68ec039d74ce3..c1e914c03c74e7 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -306,7 +306,3 @@ BLAS and LAPACK Operations .. autofunction:: svd .. autofunction:: symeig .. autofunction:: trtrs - -Utilities ----------------------------------- -.. 
autofunction:: compiled_with_cxx11_abi diff --git a/scripts/build_anaconda.sh b/scripts/build_anaconda.sh index 62185d1e9dc821..1db0f546724103 100755 --- a/scripts/build_anaconda.sh +++ b/scripts/build_anaconda.sh @@ -296,10 +296,6 @@ fi # Add packages required for all Caffe2 builds add_package 'glog' add_package 'gflags' -add_package 'mkl' '>=2018' -add_package 'mkl-include' -add_package 'typing' -append_to_section 'build' '- pyyaml' caffe2_cmake_args+=("-DUSE_LEVELDB=OFF") caffe2_cmake_args+=("-DUSE_LMDB=OFF") @@ -307,6 +303,10 @@ caffe2_cmake_args+=("-DUSE_LMDB=OFF") # Add packages required for pytorch if [[ -n $integrated ]]; then add_package 'cffi' + add_package 'mkl' '>=2018' + add_package 'mkl-include' + add_package 'typing' + append_to_section 'build' '- pyyaml' append_to_section 'build' '- setuptools' #caffe2_cmake_args+=("-DBLAS=MKL") if [[ -n $cuda_ver ]]; then diff --git a/setup.py b/setup.py index 2e2ef60fb41313..042d8668bb7b96 100644 --- a/setup.py +++ b/setup.py @@ -659,9 +659,7 @@ def run(self): # Clang has an unfixed bug leading to spurious missing # braces warnings, see # https://bugs.llvm.org/show_bug.cgi?id=21629 - '-Wno-missing-braces', - # gcc7 seems to report spurious warnings with this enabled - "-Wno-stringop-overflow", + '-Wno-missing-braces' ] if check_env_flag('WERROR'): extra_compile_args.append('-Werror') @@ -1025,7 +1023,6 @@ def make_relative_rpath(path): 'lib/torch_shm_manager', 'lib/*.h', 'lib/include/ATen/*.h', - 'lib/include/ATen/core/*.h', 'lib/include/ATen/detail/*.h', 'lib/include/ATen/cuda/*.h', 'lib/include/ATen/cuda/*.cuh', diff --git a/setup_caffe2.py b/setup_caffe2.py index d8ebf4fc7ed84f..0fd620549b31d8 100644 --- a/setup_caffe2.py +++ b/setup_caffe2.py @@ -131,7 +131,6 @@ def run(self): # configure cmake_args = [ find_executable('cmake'), - '-DUSE_ATEN=ON', '-DBUILD_SHARED_LIBS=OFF', '-DPYTHON_EXECUTABLE:FILEPATH={}'.format(sys.executable), '-DPYTHON_INCLUDE_DIR={}'.format(sysconfig.get_python_inc()), diff --git a/test/common.py b/test/common.py index 4dbe3c56c47c98..1eb4076dbf360b 100644 --- a/test/common.py +++ b/test/common.py @@ -118,6 +118,16 @@ def dec(fn): return dec +def skipIfNoZeroSize(fn): + @wraps(fn) + def wrapper(*args, **kwargs): + if torch._C._use_zero_size_dim(): + fn(*args, **kwargs) + else: + raise unittest.SkipTest('Compiled without arbitrary zero size dimension support') + return wrapper + + def get_cuda_memory_usage(): # we don't need CUDA synchronize because the statistics are not tracked at # actual freeing, but at when marking the block as free. 
diff --git a/test/common_nn.py b/test/common_nn.py index 0444ba4eb6ae46..6172f4b15adc3f 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -125,7 +125,6 @@ def get_weight(m): module_name='ELU', constructor_args=(2.,), input_size=(3, 2, 5), - reference_fn=lambda x, _: torch.where(x >= 0, x, 2 * (x.exp() - 1)) ), # TODO: reference function dict( @@ -449,43 +448,6 @@ def marginrankingloss_reference(input1, input2, target, margin=0, reduction='ele return output -# this directly follows Graves et al's paper, in contrast to the production implementation, it does not use log-space -def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='elementwise_mean'): - input_lengths = torch.tensor(input_lengths, dtype=torch.long) - target_lengths = torch.tensor(target_lengths, dtype=torch.long) - dt = log_probs.dtype - log_probs = log_probs.double() # we need the accuracy as we are not in logspace - targets = targets.long() - cum_target_lengths = target_lengths.cumsum(0) - losses = [] - for i in range(log_probs.size(1)): - input_length = input_lengths[i].item() - target_length = target_lengths[i].item() - cum_target_length = cum_target_lengths[i].item() - targets_prime = targets.new_full((2 * target_length + 1,), blank) - if targets.dim() == 2: - targets_prime[1::2] = targets[i, :target_length] - else: - targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length] - probs = log_probs[:input_length, i].exp() - alpha = log_probs.new_zeros((target_length * 2 + 1,)) - alpha[0] = probs[0, blank] - alpha[1] = probs[0, targets_prime[1]] - mask_third = (targets_prime[:-2] != targets_prime[2:]) - for t in range(1, input_length): - alpha_next = alpha.clone() - alpha_next[1:] += alpha[:-1] - alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1)) - alpha = probs[t, targets_prime] * alpha_next - losses.append(-alpha[-2:].sum().log()[None]) - output = torch.cat(losses, 0) - if reduction == 'elementwise_mean': - return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean() - elif reduction == 'sum': - return output.sum() - output = output.to(dt) - return output - loss_reference_fns = { 'KLDivLoss': kldivloss_reference, 'NLLLoss': nllloss_reference, @@ -498,7 +460,6 @@ def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0 'CosineEmbeddingLoss': cosineembeddingloss_reference, 'TripletMarginLoss': tripletmarginloss_reference, 'MarginRankingLoss': marginrankingloss_reference, - 'CTCLoss': ctcloss_reference, } @@ -880,7 +841,7 @@ def check_criterion_jacobian(self, criterion, input, target): class TestBase(object): - _required_arg_names = {'constructor_args', 'input', 'extra_args'} + _required_arg_names = {'constructor_args', 'input'} def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs): self.desc = desc @@ -889,8 +850,8 @@ def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwa self.reference_fn = reference_fn for name in self._required_arg_names: if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs: - if name in {'constructor_args', 'extra_args'}: - kwargs[name] = tuple() + if name == 'constructor_args': + kwargs['constructor_args'] = tuple() else: raise ValueError("{}: Specify {} by a value, a function to generate it, or it's size!" 
.format(self.get_name(), name)) @@ -918,10 +879,6 @@ def _unpack(self, value): def constructor_args(self): return self._get_arg('constructor_args', True) - @property - def extra_args(self): - return self._get_arg('extra_args', True) - def _get_arg(self, name, unpack): assert name in self._required_arg_names @@ -1146,9 +1103,9 @@ def __call__(self, test_case): target = self._get_target() if self.reference_fn is not None: - out = test_case._forward_criterion(module, input, target, extra_args=self.extra_args) - ref_args = (deepcopy(input), deepcopy(target)) + self.extra_args + (module,) - expected_out = self.reference_fn(*ref_args) + out = test_case._forward_criterion(module, input, target) + expected_out = self.reference_fn(deepcopy(input), + deepcopy(target), module) if isinstance(expected_out, torch.Tensor): expected_out = expected_out.item() test_case.assertEqual(out, expected_out) diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index be2fd6e0d969ba..8e66a66962d44f 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -237,7 +237,7 @@ TEST_CASE("modules") { REQUIRE(functional(torch::ones({}) * -1).toCFloat() == 0); } { - auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0, /*input_scale=*/1); + auto functional = Functional(torch::elu, /*alpha=*/1, /*scale=*/0); REQUIRE(functional(torch::ones({})).toCFloat() == 0); } } diff --git a/test/expect/TestBatched.test_for.expect b/test/expect/TestBatched.test_for.expect deleted file mode 100644 index bcbcffaee486a3..00000000000000 --- a/test/expect/TestBatched.test_for.expect +++ /dev/null @@ -1,22 +0,0 @@ -graph(%x.1_data : Dynamic - %x.1_mask : Dynamic - %x.1_dims : Dynamic - %y_data : Dynamic - %y_mask : Dynamic - %y_dims : Dynamic) { - %6 : int = prim::Constant[value=10]() - %7 : int = prim::Constant[value=1]() - %x : Dynamic, %21 : Dynamic, %22 : Dynamic = prim::Loop(%6, %7, %x.1_data, %x.1_mask, %x.1_dims) - block0(%loop_num : int, %5_data : Dynamic, %5_mask : Dynamic, %5_dims : Dynamic) { - %13 : int = prim::Constant[value=1]() - %14 : Long() = prim::NumToTensor(%13) - %alpha : float = prim::TensorToNum(%14) - %data.1 : Dynamic = aten::add(%5_data, %y_data, %alpha) - %mask : Dynamic = aten::mul(%5_mask, %y_mask) - %dims : Dynamic = aten::__or__(%5_dims, %y_dims) - %19 : int = prim::Constant[value=1]() - %data : Dynamic = aten::where(%mask, %data.1, %5_data) - -> (%19, %data, %mask, %dims) - } - return (%x, %21, %22); -} diff --git a/test/expect/TestBatched.test_if_else.expect b/test/expect/TestBatched.test_if_else.expect deleted file mode 100644 index 0698584377a433..00000000000000 --- a/test/expect/TestBatched.test_if_else.expect +++ /dev/null @@ -1,52 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : Dynamic = aten::gt(%a.1_data, %b_data) - %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %9 : int = prim::TensorToNum(%6) - %10 : int = prim::Constant[value=1]() - %11 : Long() = prim::NumToTensor(%10) - %alpha.1 : float = prim::TensorToNum(%11) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) - %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %16 : int = prim::Constant[value=1]() - %17 : Long() = prim::NumToTensor(%16) - %alpha : float = prim::TensorToNum(%17) - %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : 
Dynamic = aten::__or__(%a.1_dims, %b_dims) - %22 : Dynamic = aten::type_as(%7, %6) - %cond_mask.1 : Dynamic = aten::mul(%6, %22) - %24 : int = aten::dim(%cond_mask.1) - %25 : int = prim::Constant[value=1]() - %26 : int = aten::eq(%24, %25) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%26) - block0() { - %30 : int = aten::dim(%data.1) - %31 : int = prim::Constant[value=1]() - %32 : int = aten::sub(%30, %31) - %33 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%32, %33, %cond_mask.1) - block0(%_ : int, %36 : Dynamic) { - %37 : int = aten::dim(%36) - %data.2 : Dynamic = aten::unsqueeze(%36, %37) - %39 : int = prim::Constant[value=1]() - -> (%39, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) - %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) - %res_dims : Dynamic = aten::__or__(%dims.1, %dims) - return (%res_data, %res_mask, %res_dims); -} diff --git a/test/expect/TestBatched.test_if_else_with_scalar.expect b/test/expect/TestBatched.test_if_else_with_scalar.expect deleted file mode 100644 index c7755a5b5501fc..00000000000000 --- a/test/expect/TestBatched.test_if_else_with_scalar.expect +++ /dev/null @@ -1,53 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : float = prim::Constant[value=0.1]() - %7 : Float() = prim::NumToTensor(%6) - %other : float = prim::TensorToNum(%7) - %9 : Dynamic = aten::gt(%a.1_data, %other) - %10 : int = prim::TensorToNum(%9) - %11 : int = prim::Constant[value=1]() - %12 : Long() = prim::NumToTensor(%11) - %alpha.1 : float = prim::TensorToNum(%12) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) - %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %17 : int = prim::Constant[value=1]() - %18 : Long() = prim::NumToTensor(%17) - %alpha : float = prim::TensorToNum(%18) - %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %23 : Dynamic = aten::type_as(%a.1_mask, %9) - %cond_mask.1 : Dynamic = aten::mul(%9, %23) - %25 : int = aten::dim(%cond_mask.1) - %26 : int = prim::Constant[value=1]() - %27 : int = aten::eq(%25, %26) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%27) - block0() { - %31 : int = aten::dim(%data.1) - %32 : int = prim::Constant[value=1]() - %33 : int = aten::sub(%31, %32) - %34 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%33, %34, %cond_mask.1) - block0(%_ : int, %37 : Dynamic) { - %38 : int = aten::dim(%37) - %data.2 : Dynamic = aten::unsqueeze(%37, %38) - %40 : int = prim::Constant[value=1]() - -> (%40, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) - %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) - %res_dims : Dynamic = aten::__or__(%dims.1, %dims) - return (%res_data, %res_mask, %res_dims); -} diff --git 
a/test/expect/TestBatched.test_if_noelse.expect b/test/expect/TestBatched.test_if_noelse.expect deleted file mode 100644 index 1d98fe9d02f29c..00000000000000 --- a/test/expect/TestBatched.test_if_noelse.expect +++ /dev/null @@ -1,46 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : Dynamic = aten::gt(%a.1_data, %b_data) - %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %9 : int = prim::TensorToNum(%6) - %10 : int = prim::Constant[value=1]() - %11 : Long() = prim::NumToTensor(%10) - %alpha : float = prim::TensorToNum(%11) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %16 : Dynamic = aten::type_as(%7, %6) - %cond_mask.1 : Dynamic = aten::mul(%6, %16) - %18 : int = aten::dim(%cond_mask.1) - %19 : int = prim::Constant[value=1]() - %20 : int = aten::eq(%18, %19) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%20) - block0() { - %24 : int = aten::dim(%data.1) - %25 : int = prim::Constant[value=1]() - %26 : int = aten::sub(%24, %25) - %27 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%26, %27, %cond_mask.1) - block0(%_ : int, %30 : Dynamic) { - %31 : int = aten::dim(%30) - %data.2 : Dynamic = aten::unsqueeze(%30, %31) - %33 : int = prim::Constant[value=1]() - -> (%33, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) - %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) - %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) - return (%res_data, %res_mask, %res_dims); -} diff --git a/test/expect/TestBatched.test_if_noelse_with_scalar.expect b/test/expect/TestBatched.test_if_noelse_with_scalar.expect deleted file mode 100644 index 935bedb22b3f80..00000000000000 --- a/test/expect/TestBatched.test_if_noelse_with_scalar.expect +++ /dev/null @@ -1,47 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : float = prim::Constant[value=0.1]() - %7 : Float() = prim::NumToTensor(%6) - %other : float = prim::TensorToNum(%7) - %9 : Dynamic = aten::gt(%a.1_data, %other) - %10 : int = prim::TensorToNum(%9) - %11 : int = prim::Constant[value=1]() - %12 : Long() = prim::NumToTensor(%11) - %alpha : float = prim::TensorToNum(%12) - %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) - %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %17 : Dynamic = aten::type_as(%a.1_mask, %9) - %cond_mask.1 : Dynamic = aten::mul(%9, %17) - %19 : int = aten::dim(%cond_mask.1) - %20 : int = prim::Constant[value=1]() - %21 : int = aten::eq(%19, %20) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%21) - block0() { - %25 : int = aten::dim(%data.1) - %26 : int = prim::Constant[value=1]() - %27 : int = aten::sub(%25, %26) - %28 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%27, %28, %cond_mask.1) - block0(%_ : int, %31 : Dynamic) { - %32 : int = aten::dim(%31) - %data.2 : Dynamic = aten::unsqueeze(%31, %32) - %34 : int = prim::Constant[value=1]() - -> (%34, 
%data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %a.1_data) - %res_mask : Dynamic = aten::where(%cond_mask, %mask, %a.1_mask) - %res_dims : Dynamic = aten::__or__(%dims, %a.1_dims) - return (%res_data, %res_mask, %res_dims); -} diff --git a/test/expect/TestBatched.test_while.expect b/test/expect/TestBatched.test_while.expect deleted file mode 100644 index a32cd392044f00..00000000000000 --- a/test/expect/TestBatched.test_while.expect +++ /dev/null @@ -1,65 +0,0 @@ -graph(%a.1_data : Dynamic - %a.1_mask : Dynamic - %a.1_dims : Dynamic - %b_data : Dynamic - %b_mask : Dynamic - %b_dims : Dynamic) { - %6 : int = prim::Constant[value=2147483647]() - %7 : Dynamic = aten::gt(%a.1_data, %b_data) - %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) - %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) - %10 : int = prim::TensorToNum(%7) - %11 : Dynamic = aten::mul(%7, %8) - %12 : Dynamic = aten::sum(%11) - %13 : int = prim::Constant[value=0]() - %14 : Dynamic = aten::gt(%12, %13) - %15 : int = prim::TensorToNum(%14) - %64 : Dynamic, %65 : Dynamic, %66 : Dynamic, %a : Dynamic, %62 : Dynamic, %63 : Dynamic = prim::Loop(%6, %15, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) - block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { - %24 : int = prim::Constant[value=1]() - %25 : Long() = prim::NumToTensor(%24) - %alpha : float = prim::TensorToNum(%25) - %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) - %mask : Dynamic = aten::mul(%6_mask, %b_mask) - %dims : Dynamic = aten::__or__(%6_dims, %b_dims) - %30 : Dynamic = aten::gt(%data.1, %b_data) - %31 : Dynamic = aten::mul(%mask, %b_mask) - %32 : Dynamic = aten::__or__(%dims, %b_dims) - %33 : int = prim::TensorToNum(%30) - %34 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) - %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %34) - %36 : int = aten::dim(%cond_mask.1) - %37 : int = prim::Constant[value=1]() - %38 : int = aten::eq(%36, %37) - %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%38) - block0() { - %42 : int = aten::dim(%data.1) - %43 : int = prim::Constant[value=1]() - %44 : int = aten::sub(%42, %43) - %45 : int = prim::Constant[value=1]() - %data.3 : Dynamic = prim::Loop(%44, %45, %cond_mask.1) - block0(%_ : int, %48 : Dynamic) { - %49 : int = aten::dim(%48) - %data.2 : Dynamic = aten::unsqueeze(%48, %49) - %51 : int = prim::Constant[value=1]() - -> (%51, %data.2) - } - %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) - %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) - -> (%cond_data.1, %cond_mask.2, %data.3) - } - block1() { - -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) - } - %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) - %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) - %res_dims : Dynamic = aten::__or__(%dims, %6_dims) - %57 : Dynamic = aten::mul(%30, %31) - %58 : Dynamic = aten::sum(%57) - %59 : int = prim::Constant[value=0]() - %60 : Dynamic = aten::gt(%58, %59) - %61 : int = prim::TensorToNum(%60) - -> (%61, %30, %31, %32, %res_data, %res_mask, %res_dims) - } - return (%a, %62, %63); -} diff --git a/test/expect/TestJit.test_concat_fusion.expect b/test/expect/TestJit.test_concat_fusion.expect index 
454a84cba1db76..027c2de33e5926 100644 --- a/test/expect/TestJit.test_concat_fusion.expect +++ b/test/expect/TestJit.test_concat_fusion.expect @@ -3,11 +3,12 @@ graph(%0 : Float(3, 20) %2 : Float(6, 20) = prim::FusionGroup_0[device=0](%0, %1) return (%2); } -with prim::FusionGroup_0 = graph(%3 : Float(3, 20) - %4 : Float(3, 20)) { - %6 : int = prim::Constant[value=1]() - %7 : Float(3, 20) = aten::add(%3, %4, %6) - %5 : Float(3, 20) = aten::mul(%3, %4) - %2 : Float(6, 20) = prim::FusedConcat[dim=0](%7, %5) - return (%2); +with prim::FusionGroup_0 = graph(%4 : Float(3, 20) + %5 : Float(3, 20)) { + %7 : int = prim::Constant[value=1]() + %8 : Float(3, 20) = aten::add(%4, %5, %7) + %6 : Float(3, 20) = aten::mul(%4, %5) + %2 : int = prim::Constant[value=0]() + %3 : Float(6, 20) = aten::cat(%8, %6, %2) + return (%3); } diff --git a/test/expect/TestJit.test_constant_prop_nested.expect b/test/expect/TestJit.test_constant_prop_nested.expect deleted file mode 100644 index 09ef82076edc4a..00000000000000 --- a/test/expect/TestJit.test_constant_prop_nested.expect +++ /dev/null @@ -1,15 +0,0 @@ -graph(%a : Dynamic) { - %1 : int = prim::Constant[value=2]() - %2 : Dynamic = aten::lt(%a, %1) - %3 : int = prim::TensorToNum(%2) - %c : int = prim::If(%3) - block0() { - %5 : int = prim::Constant[value=5]() - -> (%5) - } - block1() { - %6 : int = prim::Constant[value=1]() - -> (%6) - } - return (%c); -} diff --git a/test/expect/TestJit.test_constant_prop_print.expect b/test/expect/TestJit.test_constant_prop_print.expect deleted file mode 100644 index 7cadfdbbc6b3ea..00000000000000 --- a/test/expect/TestJit.test_constant_prop_print.expect +++ /dev/null @@ -1,12 +0,0 @@ -graph(%input_tensor : Dynamic) { - %1 : int = prim::Constant[value=6]() - %2 : Dynamic = ^FIXME_zerol()() - %a : Dynamic = aten::add(%1, %2) - = prim::Print(%a) - %4 : int = prim::Constant[value=2]() - %5 : int = prim::Constant[value=1]() - %b : Dynamic = aten::add(%a, %4, %5) - %7 : int = prim::Constant[value=1]() - %8 : Dynamic = aten::add(%b, %input_tensor, %7) - return (%8); -} diff --git a/test/expect/TestJit.test_constant_prop_rand.expect b/test/expect/TestJit.test_constant_prop_rand.expect deleted file mode 100644 index a6c305258bff95..00000000000000 --- a/test/expect/TestJit.test_constant_prop_rand.expect +++ /dev/null @@ -1,11 +0,0 @@ -graph() { - %0 : int = prim::Constant[value=6]() - %1 : int = prim::Constant[value=0]() - %2 : int[] = prim::Constant[value=[0, -1]]() - %3 : int[] = prim::Constant[value=[3]]() - %a : Dynamic = aten::randn(%3, %0, %1, %2) - %5 : int = prim::Constant[value=2]() - %6 : int = prim::Constant[value=1]() - %b : Dynamic = aten::add(%a, %5, %6) - return (%b); -} diff --git a/test/expect/TestJit.test_constant_prop_simple.expect b/test/expect/TestJit.test_constant_prop_simple.expect deleted file mode 100644 index 029f9ac05a0783..00000000000000 --- a/test/expect/TestJit.test_constant_prop_simple.expect +++ /dev/null @@ -1,5 +0,0 @@ -graph(%input_tensor : Dynamic) { - %1 : int = prim::Constant[value=8]() - %2 : Dynamic = aten::add(%1, %input_tensor) - return (%2); -} diff --git a/test/expect/TestJit.test_lstm_fusion_concat.expect b/test/expect/TestJit.test_lstm_fusion_concat.expect index f0771c133c11d9..7884a95c48c9a1 100644 --- a/test/expect/TestJit.test_lstm_fusion_concat.expect +++ b/test/expect/TestJit.test_lstm_fusion_concat.expect @@ -16,33 +16,34 @@ graph(%0 : Float(3, 10) %21 : Float(6, 20) = prim::FusionGroup_0[device=0](%2, %16, %20, %15, %19, %14, %18, %13, %17) return (%21); } -with prim::FusionGroup_0 = 
graph(%15 : Float(3, 20) - %25 : Float(3!, 20) +with prim::FusionGroup_0 = graph(%16 : Float(3, 20) %26 : Float(3!, 20) - %29 : Float(3!, 20) + %27 : Float(3!, 20) %30 : Float(3!, 20) - %33 : Float(3!, 20) + %31 : Float(3!, 20) %34 : Float(3!, 20) - %37 : Float(3!, 20) - %38 : Float(3!, 20)) { - %39 : int = prim::Constant[value=1]() - %40 : Float(3, 20) = aten::add(%37, %38, %39) - %35 : int = prim::Constant[value=1]() - %36 : Float(3, 20) = aten::add(%33, %34, %35) - %31 : int = prim::Constant[value=1]() - %32 : Float(3, 20) = aten::add(%29, %30, %31) - %27 : int = prim::Constant[value=1]() - %28 : Float(3, 20) = aten::add(%25, %26, %27) - %24 : Float(3, 20) = aten::sigmoid(%40) - %22 : Float(3, 20) = aten::sigmoid(%36) - %20 : Float(3, 20) = aten::tanh(%32) - %18 : Float(3, 20) = aten::sigmoid(%28) - %16 : Float(3, 20) = aten::mul(%22, %15) - %13 : Float(3, 20) = aten::mul(%24, %20) - %9 : int = prim::Constant[value=1]() - %10 : Float(3, 20) = aten::add(%16, %13, %9) - %6 : Float(3, 20) = aten::tanh(%10) - %5 : Float(3, 20) = aten::mul(%18, %6) - %2 : Float(6, 20) = prim::FusedConcat[dim=0](%5, %10) - return (%2); + %35 : Float(3!, 20) + %38 : Float(3!, 20) + %39 : Float(3!, 20)) { + %40 : int = prim::Constant[value=1]() + %41 : Float(3, 20) = aten::add(%38, %39, %40) + %36 : int = prim::Constant[value=1]() + %37 : Float(3, 20) = aten::add(%34, %35, %36) + %32 : int = prim::Constant[value=1]() + %33 : Float(3, 20) = aten::add(%30, %31, %32) + %28 : int = prim::Constant[value=1]() + %29 : Float(3, 20) = aten::add(%26, %27, %28) + %25 : Float(3, 20) = aten::sigmoid(%41) + %23 : Float(3, 20) = aten::sigmoid(%37) + %21 : Float(3, 20) = aten::tanh(%33) + %19 : Float(3, 20) = aten::sigmoid(%29) + %17 : Float(3, 20) = aten::mul(%23, %16) + %14 : Float(3, 20) = aten::mul(%25, %21) + %10 : int = prim::Constant[value=1]() + %11 : Float(3, 20) = aten::add(%17, %14, %10) + %7 : Float(3, 20) = aten::tanh(%11) + %6 : Float(3, 20) = aten::mul(%19, %7) + %2 : int = prim::Constant[value=0]() + %3 : Float(6, 20) = aten::cat(%6, %11, %2) + return (%3); } diff --git a/test/expect/TestScript.test_cat_lifts.expect b/test/expect/TestScript.test_cat_lifts.expect index c8c82e5199c030..ea2fa3737c0556 100644 --- a/test/expect/TestScript.test_cat_lifts.expect +++ b/test/expect/TestScript.test_cat_lifts.expect @@ -1,18 +1,15 @@ graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic[] = prim::ListConstruct(%x, %x) - %3 : Dynamic = aten::cat(%2, %1) - return (%3); + %2 : Dynamic = aten::cat(%x, %x, %1) + return (%2); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic[] = prim::ListConstruct() - %3 : Dynamic = aten::cat(%2, %1) - return (%3); + %2 : Dynamic = aten::cat(%1) + return (%2); } graph(%x : Dynamic) { %1 : int = prim::Constant[value=1]() - %2 : Dynamic[] = prim::ListConstruct(%x) - %3 : Dynamic = aten::cat(%2, %1) - return (%3); + %2 : Dynamic = aten::cat(%x, %1) + return (%2); } diff --git a/test/expect/TestScript.test_index_put_trace_with_view.expect b/test/expect/TestScript.test_index_put_trace_with_view.expect index 37f08643f139a4..591e499da96671 100644 --- a/test/expect/TestScript.test_index_put_trace_with_view.expect +++ b/test/expect/TestScript.test_index_put_trace_with_view.expect @@ -6,7 +6,6 @@ graph(%0 : Double(100) %5 : Double(4) = aten::view(%2, %4) %6 : int = prim::Constant[value=0]() %7 : Long(4) = aten::_cast_Long(%1, %6) - %8 : Dynamic[] = prim::ListConstruct(%7) - %20 : Double(100) = aten::index_put(%0, %8, %5) - return (%20); + %19 : Double(100) = 
aten::index_put(%0, %7, %5) + return (%19); } diff --git a/test/expect/TestScript.test_index_put_trace_without_view.expect b/test/expect/TestScript.test_index_put_trace_without_view.expect index 772308223b454b..42f8e49142942e 100644 --- a/test/expect/TestScript.test_index_put_trace_without_view.expect +++ b/test/expect/TestScript.test_index_put_trace_without_view.expect @@ -3,7 +3,6 @@ graph(%0 : Double(100) %2 : Double(4)) { %3 : int = prim::Constant[value=0]() %4 : Long(4) = aten::_cast_Long(%1, %3) - %5 : Dynamic[] = prim::ListConstruct(%4) - %17 : Double(100) = aten::index_put(%0, %5, %2) - return (%17); + %16 : Double(100) = aten::index_put(%0, %4, %2) + return (%16); } diff --git a/test/onnx/expect/TestOperators.test_elu.expect b/test/onnx/expect/TestOperators.test_elu.expect deleted file mode 100644 index a8eff9ab2c1387..00000000000000 --- a/test/onnx/expect/TestOperators.test_elu.expect +++ /dev/null @@ -1,63 +0,0 @@ -ir_version: 3 -producer_name: "pytorch" -producer_version: "0.3" -graph { - node { - input: "0" - output: "1" - op_type: "Elu" - attribute { - name: "alpha" - f: 1 - type: FLOAT - } - } - name: "torch-jit-export" - input { - name: "0" - type { - tensor_type { - elem_type: FLOAT - shape { - dim { - dim_value: 1 - } - dim { - dim_value: 2 - } - dim { - dim_value: 3 - } - dim { - dim_value: 4 - } - } - } - } - } - output { - name: "1" - type { - tensor_type { - elem_type: FLOAT - shape { - dim { - dim_value: 1 - } - dim { - dim_value: 2 - } - dim { - dim_value: 3 - } - dim { - dim_value: 4 - } - } - } - } - } -} -opset_import { - version: 7 -} diff --git a/test/onnx/expect/TestOperators.test_equal.expect b/test/onnx/expect/TestOperators.test_equal.expect index fc23156d1cbf47..3d8210b14bcbee 100644 --- a/test/onnx/expect/TestOperators.test_equal.expect +++ b/test/onnx/expect/TestOperators.test_equal.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_ge.expect b/test/onnx/expect/TestOperators.test_ge.expect index 204a59e88ef5a6..e50f2e12537d56 100644 --- a/test/onnx/expect/TestOperators.test_ge.expect +++ b/test/onnx/expect/TestOperators.test_ge.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_gt.expect b/test/onnx/expect/TestOperators.test_gt.expect index d3eb9cf08c30a6..3cda8f244819b7 100644 --- a/test/onnx/expect/TestOperators.test_gt.expect +++ b/test/onnx/expect/TestOperators.test_gt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_le.expect b/test/onnx/expect/TestOperators.test_le.expect index 39ba6940e2289c..2aefbc6dbc8622 100644 --- a/test/onnx/expect/TestOperators.test_le.expect +++ b/test/onnx/expect/TestOperators.test_le.expect @@ -50,7 +50,7 @@ graph { name: "3" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git a/test/onnx/expect/TestOperators.test_lt.expect b/test/onnx/expect/TestOperators.test_lt.expect index cd9c4eaaaf50a7..83656cb3a5ce04 100644 --- a/test/onnx/expect/TestOperators.test_lt.expect +++ b/test/onnx/expect/TestOperators.test_lt.expect @@ -45,7 +45,7 @@ graph { name: "2" type { tensor_type { - elem_type: UINT8 + elem_type: INT8 shape { dim { dim_value: 3 diff --git 
a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect index 3c1321664dd3fd..b1ff53c2e4e7d8 100644 --- a/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect +++ b/test/onnx/expect/TestOperators.test_repeat_dim_overflow.expect @@ -10,33 +10,33 @@ graph { t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" } type: TENSOR } } node { + input: "0" + input: "1" output: "2" + op_type: "Reshape" + } + node { + output: "3" op_type: "Constant" attribute { name: "value" t { dims: 4 data_type: INT64 - raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000" + raw_data: "\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\003\000\000\000\000\000\000\000\004\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "0" input: "2" - output: "3" - op_type: "Reshape" - } - node { input: "3" - input: "1" output: "4" op_type: "Tile" } diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index ba8292e616686a..1e2c401dcc3ac0 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -364,10 +364,6 @@ def test_pow(self): y = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(lambda x, y: x.pow(y), (x, y)) - def test_elu(self): - x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) - self.assertONNX(nn.ELU(), x) - def test_selu(self): x = Variable(torch.randn(1, 2, 3, 4), requires_grad=True) self.assertONNX(nn.SELU(), x) diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 7130a7695cc69b..85ef2eac5bf2ce 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -676,52 +676,6 @@ def forward(self, x): x = Variable(torch.randn(*shape)) self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - def test_cumsum(self): - shape = (3, 4, 5) - for params in [{'dim': i} for i in range(len(shape))]: - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x): - return torch.cumsum(x, **params) - x = Variable(torch.randn(*shape)) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - - def test_repeat(self): - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x): - return x.repeat(1, 2, 3, 4) - - x = Variable(torch.randn(4, 3, 2, 1), requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - - def test_repeat_dim_overflow(self): - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x): - return x.repeat(1, 2, 3, 4) - - x = Variable(torch.randn(1, 2), requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x), batch_size=BATCH_SIZE, use_gpu=False) - - def test_repeat_dynamic(self): - class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - - def forward(self, x, y): - return x.repeat(y.size()[0] / 2, y.size()[1] * 2) - - x = Variable(torch.randn(1, 2), requires_grad=True) - y = 
Variable(torch.randn(2, 4), requires_grad=True) - self.run_model_test(MyModel(), train=False, input=(x, y), batch_size=BATCH_SIZE, use_gpu=False) - def test_mean(self): shape = (3, 4, 5) for params in [{}] + [{'dim': i} for i in range(len(shape))]: diff --git a/test/test_autograd.py b/test/test_autograd.py index 9d39043db9b56d..3ef7c21d49fc90 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -15,7 +15,7 @@ from torch.autograd.function import once_differentiable from torch.autograd.profiler import profile from common import TEST_MKL, TestCase, run_tests, skipIfNoLapack, \ - suppress_warnings, TEST_WITH_ROCM + suppress_warnings, skipIfNoZeroSize, TEST_WITH_ROCM from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction from torch.testing import make_non_contiguous, randn_like @@ -1851,16 +1851,6 @@ def backward(ctx, grad_output): out.sum().backward() self.assertEqual(x.grad.data, y_data) - def test_broadcast_tensors(self): - f_args_variable = (torch.randn(3, requires_grad=True), - torch.randn(1, 2, 1, requires_grad=True), - torch.randn(1, 1, requires_grad=True), - torch.randn(5, 1, 1, requires_grad=True)) - f_args_tensor = deepcopy(unpack_variables(f_args_variable)) - run_functional_checks(self, "test_broadcast_tensors", "broadcast", - lambda a, b, c, d: torch.broadcast_tensors(a, b, c, d), - True, f_args_variable, f_args_tensor) - def test_cat(self): f_args_variable = (torch.randn(1, S, S, requires_grad=True), torch.randn(2, S, S, requires_grad=True), @@ -1902,6 +1892,7 @@ def test_cat_empty_legacy(self): False, f_args_variable, f_args_tensor) self.assertTrue(gradcheck(lambda a, b: torch.cat((a, b)), f_args_variable, eps=1e-6, atol=PRECISION)) + @skipIfNoZeroSize def test_cat_empty(self): f_args_variable = (torch.randn(0, S, requires_grad=True), torch.randn(S, S, requires_grad=True)) @@ -1910,6 +1901,7 @@ def test_cat_empty(self): lambda a, b: torch.cat((a, b)), True, f_args_variable, f_args_tensor) + @skipIfNoLapack def test_potrf(self): root = Variable(torch.tril(torch.rand(S, S)), requires_grad=True) @@ -3131,7 +3123,7 @@ class dont_convert(tuple): ('select', (S, S, S), (1, -1), 'wrap_dim', [0]), ('select', (S,), (0, 2), '1d'), ('narrow', (S, S, S), (1, 2, 2), 'dim', [0]), - ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0]), + ('narrow', (S, S, S), (1, 0, 0), 'empty_dim', [0], [skipIfNoZeroSize]), ('squeeze', (S, 1, S, 1), NO_ARGS), ('squeeze', (1, 1, 1, 1), NO_ARGS, 'input_sizes_are_ones'), ('squeeze', (S, 1, S, 1), (1,), '1_dim', [0]), diff --git a/test/test_distributions.py b/test/test_distributions.py index 8a607ece6931c5..7effb9012e9fc6 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -42,8 +42,8 @@ Independent, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, Multinomial, MultivariateNormal, - NegativeBinomial, Normal, OneHotCategorical, Pareto, - Poisson, RelaxedBernoulli, RelaxedOneHotCategorical, + Normal, OneHotCategorical, Pareto, Poisson, + RelaxedBernoulli, RelaxedOneHotCategorical, StudentT, TransformedDistribution, Uniform, Weibull, constraints, kl_divergence) from torch.distributions.constraint_registry import biject_to, transform_to @@ -123,16 +123,6 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': torch.tensor(0.)}, ]), - Example(NegativeBinomial, [ - {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 
0.9]], requires_grad=True), 'total_count': 10}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10])}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), 'total_count': torch.tensor([10, 8])}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), - 'total_count': torch.tensor([[10., 8.], [5., 3.]])}, - {'probs': torch.tensor([[0.9, 0.0], [0.0, 0.9]], requires_grad=True), - 'total_count': torch.tensor(0.)}, - ]), Example(Multinomial, [ {'probs': torch.tensor([[0.1, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), 'total_count': 10}, {'probs': torch.tensor([[1.0, 0.0], [0.0, 1.0]], requires_grad=True), 'total_count': 10}, @@ -452,12 +442,6 @@ def is_all_nan(tensor): {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), 'total_count': 10}, ]), - Example(NegativeBinomial, [ - {'probs': torch.tensor([[-0.0000001, 0.2, 0.3], [0.5, 0.3, 0.2]], requires_grad=True), - 'total_count': 10}, - {'probs': torch.tensor([[1.0, 0.0], [0.0, 2.0]], requires_grad=True), - 'total_count': 10}, - ]), Example(Cauchy, [ {'loc': 0.0, 'scale': -1.0}, {'loc': torch.tensor([0.0]), 'scale': 0.0}, @@ -927,37 +911,6 @@ def test_binomial_enumerate_support(self): bin1 = Binomial(torch.tensor(5), torch.tensor(0.5)) self.assertEqual(bin1.enumerate_support(), torch.arange(6)) - def test_negative_binomial(self): - p = torch.tensor(torch.arange(0.05, 1, 0.1), requires_grad=True) - for total_count in [1, 2, 10]: - self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, p), [p]) - self._gradcheck_log_prob(lambda p: NegativeBinomial(total_count, None, p.log()), [p]) - self.assertRaises(NotImplementedError, NegativeBinomial(10, p).rsample) - self.assertRaises(NotImplementedError, NegativeBinomial(10, p).entropy) - - @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_negative_binomial_log_prob(self): - probs = torch.tensor(torch.arange(0.05, 1, 0.1)) - for total_count in [1, 2, 10]: - - def ref_log_prob(idx, x, log_prob): - p = probs.view(-1)[idx].item() - expected = scipy.stats.nbinom(total_count, 1 - p).logpmf(x) - self.assertAlmostEqual(log_prob, expected, places=3) - - self._check_log_prob(NegativeBinomial(total_count, probs), ref_log_prob) - logits = probs_to_logits(probs, is_binary=True) - self._check_log_prob(NegativeBinomial(total_count, logits=logits), ref_log_prob) - - @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_negative_binomial_log_prob_vectorized_count(self): - probs = torch.tensor([0.2, 0.7, 0.9]) - for total_count, sample in [(torch.tensor([10]), torch.tensor([7., 3., 9.])), - (torch.tensor([1, 2, 10]), torch.tensor([0., 1., 9.]))]: - log_prob = NegativeBinomial(total_count, probs).log_prob(sample) - expected = scipy.stats.nbinom(total_count.cpu().numpy(), 1 - probs.cpu().numpy()).logpmf(sample) - self.assertAlmostEqual(log_prob, expected, places=4) - def test_multinomial_1d(self): total_count = 10 p = torch.tensor([0.1, 0.2, 0.3], requires_grad=True) @@ -3522,7 +3475,7 @@ def setUp(self): ), ( Binomial(10, simplex_tensor), - scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor.numpy()) + scipy.stats.binom(10 * np.ones(simplex_tensor.shape), simplex_tensor) ), ( Cauchy(random_var, positive_var), @@ -3909,9 +3862,6 @@ def get_constraints(self, is_cuda=False): constraints.greater_than(0), constraints.greater_than(2), constraints.greater_than(-2), - constraints.greater_than_eq(0), - constraints.greater_than_eq(2), - constraints.greater_than_eq(-2), 
constraints.less_than(tensor([-10., -2, 0, 2, 10])), constraints.less_than(0), constraints.less_than(2), @@ -3921,10 +3871,6 @@ def get_constraints(self, is_cuda=False): tensor([-3., 3, 1, 5, 5])), constraints.interval(-2, -1), constraints.interval(1, 2), - constraints.half_open_interval(tensor([-4., -2, 0, 2, 4]), - tensor([-3., 3, 1, 5, 5])), - constraints.half_open_interval(-2, -1), - constraints.half_open_interval(1, 2), constraints.simplex, constraints.lower_cholesky, ] diff --git a/test/test_indexing.py b/test/test_indexing.py index afe9e6d60c653c..00865d9f576b74 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -1,4 +1,4 @@ -from common import TestCase, run_tests +from common import TestCase, run_tests, skipIfNoZeroSize import torch import warnings from torch import tensor @@ -93,6 +93,7 @@ def test_empty_index(self): y[mask] = -1 self.assertEqual(x, y) + @skipIfNoZeroSize def test_empty_ndim_index(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -103,12 +104,14 @@ def test_empty_ndim_index(self): self.assertEqual(torch.empty(2, 0, 6, 4, 5, device=device), x[:, torch.empty(0, 6, dtype=torch.int64, device=device)]) + @skipIfNoZeroSize def test_empty_ndim_index_bool(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(5, device=device) self.assertRaises(IndexError, lambda: x[torch.empty(0, 2, dtype=torch.uint8, device=device)]) + @skipIfNoZeroSize def test_empty_slice(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -472,18 +475,26 @@ def test_boolean_indexing_twodim(self): def test_boolean_indexing_weirdness(self): # Weird boolean indexing things a = torch.ones((2, 3, 4)) - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + if torch._C._use_zero_size_dim(): + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + else: + self.assertEqual((0,), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[True, [0, 1], True, True, [1], [[2]]]) - self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) + if torch._C._use_zero_size_dim(): + self.assertRaises(RuntimeError, lambda: a[False, [0, 1], ...]) def test_boolean_indexing_weirdness_tensors(self): # Weird boolean indexing things false = torch.tensor(False) true = torch.tensor(True) a = torch.ones((2, 3, 4)) - self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + if torch._C._use_zero_size_dim(): + self.assertEqual((0, 2, 3, 4), a[False, True, ...].shape) + else: + self.assertEqual((0,), a[False, True, ...].shape) self.assertEqual(torch.ones(1, 2), a[true, [0, 1], true, true, [1], [[2]]]) - self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) + if torch._C._use_zero_size_dim(): + self.assertRaises(RuntimeError, lambda: a[false, [0, 1], ...]) def test_boolean_indexing_alldims(self): true = torch.tensor(True) diff --git a/test/test_jit.py b/test/test_jit.py index b3bbe9892bc7db..ab4c907e72d19f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1122,95 +1122,13 @@ def test_fn(ten, mask): ten = torch.rand(3, 3) self.assertEqual(test_fn(ten, mask), traced_test_fn(ten, mask)) - def test_constant_prop_simple(self): - @torch.jit.script - def constant_prop(input_tensor): - a = 2 * 3 - b = a + 2 - return b + input_tensor - - x = torch.tensor(2) - out_ref = constant_prop(x) - self.run_pass('constant_propagation', constant_prop.graph) - out_test = constant_prop(torch.tensor(2)) - self.assertEqual(out_ref, 
out_test) - self.assertExpected(canonical(constant_prop.graph)) - - def test_constant_prop_nested(self): - @torch.jit.script - def constant_prop(a): - b = 2 + 1 - if a < 2: - c = b + 2 - else: - c = b - 2 - return c - - out_ref = constant_prop(torch.tensor(2)) - self.run_pass('constant_propagation', constant_prop.graph) - out_test = constant_prop(torch.tensor(2)) - self.assertEqual(out_ref, out_test) - self.assertExpected(canonical(constant_prop.graph)) - - def test_constant_prop_print(self): - @torch.jit.script - def constant_prop(input_tensor): - a = 2 * 3 + FIXME_zerol() - print(a) - b = a + 2 - return b + input_tensor - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - - def test_constant_prop_rand(self): - @torch.jit.script - def constant_prop(): - a = torch.randn([3]) - b = a + 2 - return b - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - - # TODO: implement - @unittest.expectedFailure - def test_constant_prop_if_constant(self): - @torch.jit.script - def constant_prop(): - b = 3 - if True: - b = 1 - if False: - b = 2 - return b - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - - # TODO: implement - @unittest.expectedFailure - def test_constant_prop_loop_constant(self): - @torch.jit.script - def constant_prop(): - b = 0 - while True: - b = 1 - while False: - b = 2 - return b - - self.run_pass('constant_propagation', constant_prop.graph) - self.assertExpected(canonical(constant_prop.graph)) - class TestBatched(TestCase): # generate random examples and create an batchtensor with them def rand_batch(self, *dims): dims = [dim for dim in dims if dim != ()] - xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:]), - requires_grad=True) for i in range(dims[0])] - xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]]).byte()) + xs = [torch.rand(1, *(random.randint(1, size) if b else size for b, size in dims[1:])) for i in range(dims[0])] + xb = BatchTensor(xs, torch.tensor([b for b, d in dims[1:]])) return xs, xb def test_create_batchtensor(self): @@ -1238,20 +1156,20 @@ def tanh(a): def test_batch_elementwise_binary(self): @torch.jit.batch(batch_size=4) - def add(a, b): - return a + b + def mul(a, b): + return a * b xs, batch = self.rand_batch(4, (True, 3), (False, 2)) xs2, batch2 = xs, batch - res_batch = add(batch, batch2) - res = [torch.add(xs[j], xs2[j]) for j in range(4)] + res_batch = mul(batch, batch2) + res = [torch.mul(xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) # test broadcast xs, batch = self.rand_batch(4, (False, 3), (False, 2)) b = torch.rand(3, 2) - res_batch = add(batch, b) - res = [torch.add(xs[j], b) for j in range(4)] + res_batch = mul(batch, b) + res = [torch.mul(xs[j], b) for j in range(4)] self.assertEqual(res, res_batch.examples()) def test_batch_mm(self): @@ -1298,33 +1216,6 @@ def matmul_test(xs, batch, xs2, batch2): xs2, batch2 = self.rand_batch(4, (False, 2), (True, 3)) matmul_test(xs, batch, xs2, batch2) - def test_batch_select(self): - @torch.jit.batch(batch_size=4) - def select(x): - return torch.select(x, 1, 0) - - xs, batch = self.rand_batch(4, (True, 3), (True, 2)) - res_batch = select(batch) - res = [torch.select(xs[j], 1, 0) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - xs, batch = self.rand_batch(4, (False, 3), (True, 2)) - res_batch = select(batch) - res = 
[torch.select(xs[j], 1, 0) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_index_select(self): - @torch.jit.batch(batch_size=4) - def index_select(x, ind): - return x.index_select(1, ind) - - xs, batch = self.rand_batch(4, (False, 5), (True, 2)) - ind = [torch.randint(0, 4, (1,), dtype=torch.long) for i in range(4)] - ind_batch = BatchTensor(ind, torch.tensor([]).byte()) - res_batch = index_select(batch, ind_batch) - res = [torch.index_select(xs[j], 1, ind[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - def test_batch_where(self): @torch.jit.batch(batch_size=4) def where(c, a, b): @@ -1341,300 +1232,43 @@ def where(c, a, b): res = [torch.where(xs_cond[j], xs[j], xs2[j]) for j in range(4)] self.assertEqual(res, res_batch.examples()) - def test_batch_argmax(self): - @torch.jit.batch(batch_size=4) - def argmax(a): - return torch.argmax(a, 1) - - xs, batch = self.rand_batch(4, (True, 5), (True, 6)) - res_batch = argmax(batch) - res = [torch.argmax(xs[j], 1) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - @torch.jit.batch(batch_size=4) - def argmax(a): - return torch.argmax(a, 1, False) - - res_batch = argmax(batch) - res = [torch.argmax(xs[j], 1, False) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_topk(self): - @torch.jit.batch(batch_size=4) - def topk(a): - return torch.topk(a, 3, 1) - - xs, batch = self.rand_batch(4, (False, 5), (True, 6)) - - # along static dim - res_batch = topk(batch) - res = [torch.topk(xs[j], 3, 1)[0] for j in range(4)] - res_idx = [torch.topk(xs[j], 3, 1)[1] for j in range(4)] - self.assertEqual(res, res_batch[0].examples()) - self.assertEqual(res_idx, res_batch[1].examples()) - - @torch.jit.batch(batch_size=4) - def topk(a): - return torch.topk(a, 1, 2) - - # along dynamic dim - res_batch = topk(batch) - res = [torch.topk(xs[j], 1, 2)[0] for j in range(4)] - res_idx = [torch.topk(xs[j], 1, 2)[1] for j in range(4)] - self.assertEqual(res, res_batch[0].examples()) - self.assertEqual(res_idx, res_batch[1].examples()) - - def test_batch_softmax(self): - @torch.jit.batch(batch_size=4) - def softmax(a): - return torch.softmax(a, 1) - - xs, batch = self.rand_batch(4, (False, 5), (True, 6)) - - # along static dim - res_batch = softmax(batch) - res = [torch.softmax(xs[j], 1) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - @torch.jit.batch(batch_size=4) - def softmax(a): - return torch.softmax(a, 2) - - # along dynamic dim - res_batch = softmax(batch) - res = [torch.softmax(xs[j], 2) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_view(self): - @torch.jit.batch(batch_size=4) - def view(a): - return a.view([4, -1, 3]) - - xs, batch = self.rand_batch(4, (True, 5), (False, 3)) - res_batch = view(batch) - res = [xs[j].view([1, -1, 3]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_batch_cat(self): - @torch.jit.batch(batch_size=4) - def cat2(a, b): - return torch.cat([a, b], 2) - - xs, batch = self.rand_batch(4, (True, 5), (False, 3)) - xs2, batch2 = xs, batch - res_batch = cat2(batch, batch2) - res = [torch.cat([xs[j], xs2[j]], 2) for j in range(4)] - self.assertEqual(res, res_batch.examples()) + @unittest.skip("Need support for scalar arguments") + def test_lstm_cell(self): + def LSTMCell(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) 
+ b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + return h_t - def test_batch_sum(self): @torch.jit.batch(batch_size=4) - def batch_sum(a): - return a.sum() - - xs, batch = self.rand_batch(4, (True, 5), (False, 3)) - res_batch = batch_sum(batch) - res = [xs[j].sum().unsqueeze(0) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - def test_if_else(self): - def single_if(a, b): - if a > b: - a = a + b - else: - a = a - b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_if_else_with_scalar(self): - def single_if(a, b): - if a > 0.1: - a = a + b - else: - a = a - b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_if_noelse(self): - def single_if(a, b): - if a > b: - a = a + b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_if_noelse_with_scalar(self): - def single_if(a, b): - if a > 0.1: - a = a + b - return a - - batch_if = torch.jit.batch(batch_size=4)(single_if) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_if(batch_a, batch_b) - res = [single_if(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_if = torch.jit.script(single_if) - graph = torch.to_batch_graph(script_if.graph) - self.assertExpected(str(graph)) - - def test_while(self): - def single_while(a, b): - while a > b: - a = a - b - return a - - batch_while = torch.jit.batch(batch_size=4)(single_while) - - a, batch_a = self.rand_batch(4, ()) - b = [torch.abs(torch.rand(1)) for i in range(4)] - batch_b = BatchTensor(b, torch.tensor([]).byte()) - res_batch = batch_while(batch_a, batch_b) - res = [single_while(a[j], b[j]) for j in range(4)] - self.assertEqual(res, res_batch.examples()) - - script_while = torch.jit.script(single_while) - graph = torch.to_batch_graph(script_while.graph) - self.assertExpected(str(graph)) - - def test_for(self): - def single_for(x, y): - for _ in range(10): - x = x + y - return x - - batch_for = torch.jit.batch(batch_size=4)(single_for) - - a, batch_a = self.rand_batch(4, ()) - b, batch_b = self.rand_batch(4, ()) - res_batch = batch_for(batch_a, batch_b) - res = [single_for(a[j], b[j]) for j in range(4)] - 
self.assertEqual(res, res_batch.examples()) - - script_for = torch.jit.script(single_for) - graph = torch.to_batch_graph(script_for.graph) - self.assertExpected(str(graph)) - - def test_lstm(self): - def LSTM(x_all, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): - for i in range(x_all.size(1)): - x = x_all.select(1, i) - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - h = h_t - c = c_t - return h - - LSTM_batch = torch.jit.batch(batch_size=4)(LSTM) + def LSTMCell_batch(x, h, c, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c): + i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i + f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f + o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o + # activations + i_t = torch.sigmoid(i_t) + f_t = torch.sigmoid(f_t) + o_t = torch.sigmoid(o_t) + # cell computations + c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c + c_t = torch.tanh(c_t) + c_t = torch.mul(c, f_t) + torch.mul(i_t, c_t) + h_t = torch.mul(o_t, torch.tanh(c_t)) + return h_t batch_size, input_size, hidden_size = 4, 3, 2 - xs, batch = self.rand_batch(batch_size, (True, 4), (False, input_size)) - hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) - cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) - - # input to hidden weights - w_xi = torch.rand(input_size, hidden_size) - w_xf = torch.rand(input_size, hidden_size) - w_xo = torch.rand(input_size, hidden_size) - w_xc = torch.rand(input_size, hidden_size) - # hidden to hidden weights - w_hi = torch.rand(hidden_size, hidden_size) - w_hf = torch.rand(hidden_size, hidden_size) - w_ho = torch.rand(hidden_size, hidden_size) - w_hc = torch.rand(hidden_size, hidden_size) - # bias terms - b_i = torch.rand(hidden_size) - b_f = torch.rand(hidden_size) - b_o = torch.rand(hidden_size) - b_c = torch.rand(hidden_size) - - ys = [LSTM(xs[j], hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] - ybs = LSTM_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) - self.assertEqual(ys, ybs.examples()) - - def test_greedy_search(self): - def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, - b_i, b_f, b_o, b_c, w_hs, b_s, iter_num): - iter_count = torch.zeros_like(iter_num) - while(iter_count < iter_num): - iter_count += 1 - # LSTM Cell - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - h = h_t - c = c_t - # calculate feature with max probability - s_t = torch.matmul(h_t, w_hs) + b_s - p_t = torch.softmax(s_t, 1) - i_t = torch.argmax(p_t, 1) - x = embed.index_select(1, i_t).squeeze(1) - return h - - 
greedy_batch = torch.jit.batch(batch_size=4)(greedy) - - batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 xs, batch = self.rand_batch(batch_size, (False, input_size)) hx, h_batch = self.rand_batch(batch_size, (False, hidden_size)) cx, c_batch = self.rand_batch(batch_size, (False, hidden_size)) - embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) - iter_num = [torch.randint(2, 5, (1,)) for i in range(batch_size)] - iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) # input to hidden weights w_xi = torch.rand(input_size, hidden_size) @@ -1651,102 +1285,11 @@ def greedy(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, b_f = torch.rand(hidden_size) b_o = torch.rand(hidden_size) b_c = torch.rand(hidden_size) - # hidden to vocab weights, bias - w_hs = torch.rand(hidden_size, vocab_size) - b_s = torch.rand(vocab_size) - - ys = [greedy(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j]) for j in range(batch_size)] - ybs = greedy_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch) - self.assertEqual(ys, ybs.examples()) - def test_beam_search(self): - def beam(x, h, c, embed, w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, - b_i, b_f, b_o, b_c, w_hs, b_s, iter_num, idx): - k = 5 - vocab_size = embed.size(1) - iter_count = torch.zeros_like(iter_num) - max_len = idx.size(2) - while(iter_count < iter_num): - iter_count += 1 - # LSTM Cell - i_t = torch.matmul(x, w_xi) + torch.matmul(h, w_hi) + b_i - f_t = torch.matmul(x, w_xf) + torch.matmul(h, w_hf) + b_f - o_t = torch.matmul(x, w_xo) + torch.matmul(h, w_ho) + b_o - # activations - i_t = torch.sigmoid(i_t) - f_t = torch.sigmoid(f_t) - o_t = torch.sigmoid(o_t) - # cell computations - c_t = torch.matmul(x, w_xc) + torch.matmul(h, w_hc) + b_c - c_t = torch.tanh(c_t) - c_t = torch.mul(c_t, f_t) + torch.mul(i_t, c_t) - h_t = torch.mul(o_t, torch.tanh(c_t)) - h = h_t - c = c_t - # calculate features with max probability - s_t = torch.matmul(h_t, w_hs) + b_s - s_t = s_t.view([1, s_t.size(1) * s_t.size(2)]) - p_t = torch.softmax(s_t, 1) - prob_t, idx_t = torch.topk(p_t, k, 1) - if(int(idx_t.dim()) > 1): - idx_t_tmp = idx_t.squeeze(0) - else: - idx_t_tmp = idx_t - new_y = torch.fmod(idx_t_tmp, vocab_size) - pre_y = idx_t_tmp / vocab_size - x = embed.index_select(1, new_y) - h = h_t.index_select(1, pre_y) - c = c_t.index_select(1, pre_y) - iter = int(iter_count[0]) - idx = torch.cat([idx.narrow(2, 0, iter).index_select(1, pre_y), - torch.fmod(idx_t, vocab_size).unsqueeze(-1), - idx.narrow(2, iter, max_len - iter)], 2) - idx = idx.narrow(2, 0, max_len) - return idx - - beam_batch = torch.jit.batch(batch_size=4)(beam) - - k = 5 - batch_size, input_size, hidden_size, vocab_size = 4, 6, 8, 7 - max_len = 5 - xs, batch = self.rand_batch(batch_size, (False, 1), (False, input_size)) - hx, h_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) - cx, c_batch = self.rand_batch(batch_size, (False, 1), (False, hidden_size)) - embed, embed_batch = self.rand_batch(batch_size, (False, vocab_size), (False, input_size)) - iter_num = [torch.randint(2, max_len + 1, (1,)) for i in range(batch_size)] - iter_num_batch = BatchTensor(iter_num, torch.tensor([]).byte()) - - # input to hidden weights - w_xi = torch.rand(input_size, hidden_size) - w_xf = torch.rand(input_size, hidden_size) - w_xo = torch.rand(input_size, hidden_size) - 
w_xc = torch.rand(input_size, hidden_size) - # hidden to hidden weights - w_hi = torch.rand(hidden_size, hidden_size) - w_hf = torch.rand(hidden_size, hidden_size) - w_ho = torch.rand(hidden_size, hidden_size) - w_hc = torch.rand(hidden_size, hidden_size) - # bias terms - b_i = torch.rand(1, hidden_size) - b_f = torch.rand(1, hidden_size) - b_o = torch.rand(1, hidden_size) - b_c = torch.rand(1, hidden_size) - # hidden to vocab weights, bias - w_hs = torch.rand(hidden_size, vocab_size) - b_s = torch.rand(1, vocab_size) - - idx_batch = torch.jit.BatchTensor(torch.zeros([batch_size, k, max_len], dtype=torch.long), - torch.zeros([batch_size, 1, max_len]).byte(), - torch.tensor([0, 1]).byte()) - idx = [torch.zeros([1, k, max_len], dtype=torch.long) for _ in range(batch_size)] - - ys = [beam(xs[j], hx[j], cx[j], embed[j], w_xi, w_xf, w_xo, w_xc, w_hi, w_hf, w_ho, w_hc, - b_i, b_f, b_o, b_c, w_hs, b_s, iter_num[j], idx[j]).narrow(2, 0, int(iter_num[j])) - for j in range(batch_size)] - ybs = beam_batch(batch, h_batch, c_batch, embed_batch, w_xi, w_xf, w_xo, w_xc, - w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c, w_hs, b_s, iter_num_batch, idx_batch) + ys = [LSTMCell(xs[j].squeeze(0), hx[j], cx[j], w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) for j in range(batch_size)] + ybs = LSTMCell_batch(batch, h_batch, c_batch, w_xi, w_xf, w_xo, w_xc, + w_hi, w_hf, w_ho, w_hc, b_i, b_f, b_o, b_c) self.assertEqual(ys, ybs.examples()) @@ -4105,10 +3648,10 @@ def test_unknown_builtin(self): def unknown_builtin(x): return x.splork(3) - def test_return_tuple(self): - with self.assertRaisesRegex(RuntimeError, 'only supported return types'): + def test_expected_tensor_found_tuple(self): + with self.assertRaisesRegex(RuntimeError, 'expected a tensor value but found'): @torch.jit.script - def return_tuple(x): + def return_tuple_wrong(x): a = (x, x) return a, x @@ -4827,17 +4370,6 @@ def tuple_arg(x): # type: (Tuple[Tensor, Tensor]) -> Tensor return x + 1 - def test_script_non_tensor_args_outputs(self): - @torch.jit.script - def fn(x, y): - # type: (Tensor, float) -> float - return float((x + y).sum()) - - x = torch.ones(2, 2) - z = fn(x, 1) - self.assertIsInstance(z, float) - self.assertEqual(z, 8.) 
- @unittest.skip('https://github.com/pytorch/pytorch/issues/9595') def test_inline_and_run_annotated_script_fn(self): @torch.jit.script @@ -5380,9 +4912,11 @@ def forward(self, x, y): 'test_expand_new_dim', 'test_expand_new_dim_front_old_front_1', 'test_expand_scalar_to_dims', + 'test_expand_scalar_to_scalar', 'test_expand_size', 'test_permute', 'test_permute_neg_dim', + 'test_permute_scalar', 'test_repeat', 'test_repeat_scalar', 'test_repeat_single_number', @@ -5390,10 +4924,12 @@ def forward(self, x, y): 'test_reshape', 'test_reshape_1d', 'test_reshape_scalar_to_1d', + 'test_reshape_scalar_to_scalar', 'test_reshape_size', 'test_view', 'test_view_1d', 'test_view_scalar_to_1d', + 'test_view_scalar_to_scalar', 'test_view_size', 'test_split_dim', 'test_split_dim_neg0', diff --git a/test/test_legacy_nn.py b/test/test_legacy_nn.py index de65e6fc8ce7a0..1463d15cf22d0c 100644 --- a/test/test_legacy_nn.py +++ b/test/test_legacy_nn.py @@ -693,18 +693,14 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return module.backward(input, grad_output) - def _forward_criterion(self, criterion, input, target, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _forward_criterion(self, criterion, input, target): with torch.no_grad(): - return criterion.forward(input, target, *extra_args) + return criterion.forward(input, target) - def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _backward_criterion(self, criterion, input, target, gradOutput=None): # Ignore gradOutput. It's used for non-legacy tests. with torch.no_grad(): - return criterion.backward(input, target, *extra_args) + return criterion.backward(input, target) def _zero_grad_parameters(self, module): return module.zeroGradParameters() diff --git a/test/test_nn.py b/test/test_nn.py index 8682463cf9bc6c..ccd698747ae8d5 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -36,7 +36,7 @@ TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, loss_reference_fns, get_reduction, \ - get_weight, smoothl1loss_reference, kldivloss_reference, ctcloss_reference + get_weight, smoothl1loss_reference, kldivloss_reference if TEST_SCIPY: @@ -383,8 +383,6 @@ class NewCriterionTest(InputVariableMixin, CriterionTest): def __init__(self, *args, **kwargs): super(NewCriterionTest, self).__init__(*args, **kwargs) self.check_gradgrad = kwargs.get('check_gradgrad', True) - self.check_half = kwargs.get('check_half', True) - self.convert_target = kwargs.get('convert_target', True) def _do_extra_tests(self, test_case, module, input, target): if not self.check_gradgrad: @@ -409,7 +407,7 @@ def apply_fn(input1, input2, *params): gradcheck(apply_fn, inputs) gradgradcheck(apply_fn, inputs) - def test_cuda(self, test_case, dtype=None, extra_args=None): + def test_cuda(self, test_case, dtype=None): def convert_dtype(obj, dtype, requires_grad=False): if isinstance(obj, torch.Tensor): return torch.tensor(obj.data, dtype=dtype, requires_grad=requires_grad) @@ -432,7 +430,7 @@ def convert_dtype(obj, dtype, requires_grad=False): if dtype is not None: cpu_input = convert_dtype(cpu_input, dtype, True) # NLLLoss requires target to be LongTensor - if not isinstance(cpu_target, torch.LongTensor) and self.convert_target: + if not isinstance(cpu_target, torch.LongTensor): cpu_target = convert_dtype(cpu_target, dtype) cpu_module.type(dtype) gpu_module.type(dtype) @@ -449,13 +447,13 @@ 
def convert_dtype(obj, dtype, requires_grad=False): # Loss modules with weights require consistent input/module weight types cpu_module = self.constructor(*self.constructor_args) - cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) - gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) + cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target) + gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target) # dtype can be None, so set precision in this way instead of a precision map test_case.assertEqual(cpu_output, gpu_output, 1e-1 if dtype == torch.half else 4e-4) - cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target, extra_args=extra_args) - gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target, extra_args=extra_args) + cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target) + gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target) test_case.assertEqual(cpu_gradInput, gpu_gradInput, 1e-1 if dtype == torch.half else 4e-4) except NotImplementedError: pass @@ -467,10 +465,6 @@ def _get_target(self): def constructor_args(self): return self._get_arg('constructor_args', False) - @property - def extra_args(self): - return self._get_arg('extra_args', False) - class TestNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -485,24 +479,20 @@ def _backward(self, module, input, output, grad_output, create_graph=False): return None return input.grad.data - def _forward_criterion(self, criterion, input, target, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _forward_criterion(self, criterion, input, target): if isinstance(input, tuple): - args = input + (target,) + extra_args + args = input + (target,) output = criterion(*args) else: - output = criterion(input, target, *extra_args) + output = criterion(input, target) return output.item() - def _backward_criterion(self, criterion, input, target, gradOutput=None, extra_args=None): - if extra_args is None: - extra_args = tuple() + def _backward_criterion(self, criterion, input, target, gradOutput=None): input_tuple = input if isinstance(input, tuple) else (input,) for i in input_tuple: if i.grad is not None: i.grad.data.zero_() - args = input_tuple + (target,) + extra_args + args = input_tuple + (target,) if gradOutput is None: gradOutput = torch.ones(()) criterion(*args).backward(gradOutput.type_as(input_tuple[0])) @@ -1595,7 +1585,6 @@ def test(nonlinearity, *args, **kwargs): test('relu6') test('elu') test('selu') - test('celu') test('rrelu') test('rrelu', inplace=True) test('hardtanh') @@ -3589,19 +3578,6 @@ def test_NLLLoss_mismatched_batch(self): with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'): F.nll_loss(x, t) - @unittest.skipIf(not (TEST_CUDNN and TEST_CUDNN_VERSION >= 7000), "needs cudnn >= 7.0") - def test_CTCLoss_cudnn(self): - target_lengths = [30, 25, 20] - input_lengths = [50, 50, 50] - targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) - log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2) - res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) - expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float() - with torch.backends.cudnn.flags(enabled=False): - res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, 
target_lengths) - self.assertEqual(res, expected) - self.assertEqual(res2, res) - def test_RNN_cell_no_broadcasting(self): def test(cell_module, input, hx, input_size, hidden_size): cell = cell_module(input_size, hidden_size) @@ -4375,7 +4351,7 @@ def _verify_pixel_shuffle(self, input, output, upscale_factor): self.assertEqual(output[:, c, h, w], input[:, channel_idx, height_idx, weight_idx]) def test_inplace_thnn(self): - modules = [nn.ReLU, nn.ELU, nn.SELU, nn.CELU, nn.RReLU] + modules = [nn.ReLU, nn.ELU, nn.SELU, nn.RReLU] for mod in modules: r = mod(inplace=True) input = torch.randn(5, 5, requires_grad=True) @@ -4836,12 +4812,6 @@ def test_triplet_margin_loss_swap_no_reduce(self): self.assertEqual(F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none'), loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True, reduction='none')) - def test_pointwise_loss_target_grad_none_reduction(self): - i = torch.randn(5, 10) - t = torch.randn(5, 10, requires_grad=True) - self.assertEqual(F.mse_loss(i, t, reduction='none').size(), t.size()) - self.assertEqual(F.l1_loss(i, t, reduction='none').size(), t.size()) - def test_cosine_similarity(self): input1 = torch.randn(4, 4, requires_grad=True) input2 = torch.randn(4, 4, requires_grad=True) @@ -4872,30 +4842,30 @@ def test_grid_sample(self): def test_cpu_against_cuda(N, C, H, W, padding_mode): def test_shape(N, C, IH, IW, H, W, padding_mode): - input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_() - grid_cpu = torch.randn(H, N, W, 2).transpose(0, 1).requires_grad_() + input_cpu = Variable(torch.randn(C, N, IH, IW).transpose(0, 1), requires_grad=True) + grid_cpu = Variable(torch.randn(H, N, W, 2).transpose(0, 1), requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = torch.randn_like(out_cpu) + gradients = out_cpu.data.new(out_cpu.size()).normal_() out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(N, C, 1, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() + base_input = torch.randn(C, IH, IW) + input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) grid_cpu = torch.randn(N, H, W, 2, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() - grid_cuda = grid_cpu.detach().cuda().requires_grad_() + input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4903,21 +4873,21 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): test_shape(N, C, H, W, H, W, 
padding_mode) # test larger output - N = random.randint(2, 8) - C = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) + N = random.randint(1, 8) + C = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, IH, IW, H, W, padding_mode) # test smaller output - N = random.randint(2, 8) - C = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) - H = random.randint(2, IH) - W = random.randint(2, IW) + N = random.randint(1, 8) + C = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) + H = random.randint(1, IH) + W = random.randint(1, IW) test_shape(N, C, IH, IW, H, W, padding_mode) # test known input on CPU @@ -4956,38 +4926,42 @@ def test_shape(N, C, IH, IW, H, W, padding_mode): # test CUDA against CPU if TEST_CUDA: test_cpu_against_cuda(N, C, H, W, padding_mode) - if TEST_CUDNN: - with cudnn.flags(enabled=False): - test_cpu_against_cuda(N, C, H, W, padding_mode) + + # test channels >1024, which doesn't work on cudnn 7102 and further + N, C, H, W = 1, 1025, 3, 3 + self.assertTrue(gradcheck( + lambda inp, grid: F.grid_sample(inp, grid, padding_mode=padding_mode), + (input, grid))) + test_cpu_against_cuda(N, C, H, W, padding_mode) def test_grid_sample_3d(self): def test_cpu_against_cuda(N, C, D, H, W, padding_mode): def test_shape(N, C, ID, IH, IW, D, H, W, padding_mode): - input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_() - grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_() + input_cpu = Variable(torch.randn(C, N, ID, IH, IW).transpose(0, 1), requires_grad=True) + grid_cpu = Variable(torch.randn(D, N, H, W, 3).transpose(0, 1), requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) self.assertTrue(out_cpu.size() == torch.Size([N, C, D, H, W])) - input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() - grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_() + input_cuda = Variable(input_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.transpose(0, 1).cuda().transpose(0, 1), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) - gradients = torch.randn_like(out_cpu) + gradients = out_cpu.data.new(out_cpu.size()).normal_() out_cpu.backward(gradients) out_cuda.backward(gradients.cuda()) self.assertEqual(input_cpu.grad, input_cuda.grad) self.assertEqual(grid_cpu.grad, grid_cuda.grad, prec=5e-5) # check that zero-dimensional input strides don't error out - base_input = torch.randn(N, C, 1, IH, IW) - input_cpu = base_input.expand_as(input_cuda).requires_grad_() + base_input = torch.randn(C, ID, IH, IW) + input_cpu = Variable(base_input.expand(input_cuda.size()), requires_grad=True) grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True) out_cpu = F.grid_sample(input_cpu, grid_cpu, padding_mode=padding_mode) - input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_() - grid_cuda = grid_cpu.detach().cuda().requires_grad_() + input_cuda = Variable(base_input.cuda().expand(input_cuda.size()), requires_grad=True) + grid_cuda = Variable(grid_cpu.data.cuda(), requires_grad=True) out_cuda = F.grid_sample(input_cuda, grid_cuda, padding_mode=padding_mode) self.assertEqual(out_cpu, out_cuda) @@ -4995,35 +4969,35 @@ def test_shape(N, C, ID, IH, IW, D, 
H, W, padding_mode): test_shape(N, C, D, H, W, D, H, W, padding_mode) # test larger output - N = random.randint(2, 8) - C = random.randint(2, 8) - ID = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) + N = random.randint(1, 8) + C = random.randint(1, 8) + ID = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) D = random.randint(ID + 1, 12) H = random.randint(IH + 1, 12) W = random.randint(IW + 1, 12) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test smaller output - N = random.randint(2, 8) - C = random.randint(2, 8) - ID = random.randint(2, 8) - IH = random.randint(2, 8) - IW = random.randint(2, 8) - D = random.randint(2, ID) - H = random.randint(2, IH) - W = random.randint(2, IW) + N = random.randint(1, 8) + C = random.randint(1, 8) + ID = random.randint(1, 8) + IH = random.randint(1, 8) + IW = random.randint(1, 8) + D = random.randint(1, ID) + H = random.randint(1, IH) + W = random.randint(1, IW) test_shape(N, C, ID, IH, IW, D, H, W, padding_mode) # test known input on CPU for padding_mode in ['zeros', 'border']: # do gradcheck - N = random.randint(2, 8) - C = random.randint(2, 8) - D = random.randint(2, 8) - H = random.randint(2, 8) - W = random.randint(2, 8) + N = random.randint(1, 8) + C = random.randint(1, 8) + D = random.randint(1, 8) + H = random.randint(1, 8) + W = random.randint(1, 8) input = torch.randn(N, C, D, H, W, requires_grad=True) grid = torch.randn(N, D, H, W, 3, requires_grad=True) self.assertTrue(gradcheck( @@ -5566,11 +5540,6 @@ def test_unfold_invalid_arg(self): unfold = nn.Unfold(kernel_size=(1, 3), padding=(1, 1), dilation=(1, 2)) unfold(torch.randn(1, 2, 2, 2)) - def test_softmin(self): - x = torch.randn(2, 16) - self.assertEqual(F.softmin(x, 1), F.softmax(-x, 1)) - self.assertEqual(F.softmin(x, 0), F.softmax(-x, 0)) - def test_adaptive_log_softmax(self): # args validation with self.assertRaises(ValueError): @@ -6037,20 +6006,15 @@ def add(test_name, fn): add(test_name, lambda self, test=test: test(self)) cuda_test_name = test_name + '_cuda' # With dtype enable, it's good enough to test against three floating types - kwargs = {} - if 'extra_args' in get_function_arglist(test.test_cuda): - kwargs['extra_args'] = test.extra_args - if 'dtype' in get_function_arglist(test.test_cuda): add(cuda_test_name + '_float', lambda self, - test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs)) + test=test: test.test_cuda(self, dtype=torch.float)) add(cuda_test_name + '_double', lambda self, - test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs)) - if getattr(test, 'check_half', True): - add(cuda_test_name + '_half', lambda self, - test=test: test.test_cuda(self, dtype=torch.half, **kwargs)) + test=test: test.test_cuda(self, dtype=torch.double)) + add(cuda_test_name + '_half', lambda self, + test=test: test.test_cuda(self, dtype=torch.half)) else: - add(cuda_test_name, lambda self, test=test, kwargs=kwargs: test.test_cuda(self, **kwargs)) + add(cuda_test_name, lambda self, test=test: test.test_cuda(self)) def wrap_functional(fn, **kwargs): @@ -6210,45 +6174,6 @@ def forward(self, *args): check_sum_reduction=True, check_gradgrad=False, ), - dict( - module_name='CTCLoss', - constructor_args=(14,), # blank=14 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, 
il, tl, blank=14, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - ), - dict( - module_name='CTCLoss', - desc='1d_target', - constructor_args=(14,), # blank=14 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(0, 14, (3, 30), dtype=torch.long), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=14, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - ), - dict( - module_name='CTCLoss', - desc='2d_int_target', - constructor_args=(0,), # blank=0 - extra_args=([50, 50, 50], [30, 25, 20]), # input_lengths, target_lengths - input_fn=lambda: torch.randn(50, 3, 15).log_softmax(2), - target_fn=lambda: torch.randint(1, 15, (3, 30), dtype=torch.int), - reference_fn=lambda i, t, il, tl, m: - ctcloss_reference(i, t, il, tl, blank=0, reduction=get_reduction(m)), - check_sum_reduction=True, - check_gradgrad=False, - check_half=False, - convert_target=False, - ), ] @@ -7841,21 +7766,6 @@ def multimarginloss_weights_no_reduce_test(): check_inplace=True, desc='scalar' ), - dict( - module_name='CELU', - input_size=(3, 2, 5), - constructor_args=(2.,), - check_inplace=True, - reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)) - ), - dict( - module_name='CELU', - input_size=(), - constructor_args=(2.,), - check_inplace=True, - reference_fn=lambda x, _: torch.where(x >= 0, x, 2. * ((.5 * x).exp() - 1)), - desc='scalar' - ), dict( module_name='GLU', input_size=(5, 6), diff --git a/test/test_optim.py b/test/test_optim.py index 2d5b876dd3a8e1..41c3bfc1964f33 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -31,6 +31,7 @@ def wrapper(closure, params, state): class TestOptim(TestCase): + def _test_rosenbrock(self, constructor, old_fn): params_t = torch.Tensor([1.5, 1.5]) state = {} @@ -504,20 +505,6 @@ def forward(self, x): return self.conv2(F.relu(self.conv1(x))) -class LambdaLRTestObject: - def __init__(self, value): - self.value = value - - def __call__(self, epoch): - return self.value * epoch - - def __eq__(self, other): - if isinstance(other, self.__class__): - return self.__dict__ == other.__dict__ - else: - return False - - class TestLRScheduler(TestCase): def setUp(self): self.net = SchedulerTestNet() @@ -685,28 +672,6 @@ def test_reduce_lr_on_plateau_state_dict(self): if key not in {'optimizer', 'is_better'}: self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) - def test_lambda_lr_state_dict_fn(self): - scheduler = LambdaLR(self.opt, lr_lambda=lambda x: x) - state = scheduler.state_dict() - self.assertIsNone(state['lr_lambdas'][0]) - - scheduler_copy = LambdaLR(self.opt, lr_lambda=lambda x: x) - scheduler_copy.load_state_dict(state) - for key in scheduler.__dict__.keys(): - if key not in {'optimizer', 'lr_lambdas'}: - self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) - - def test_lambda_lr_state_dict_obj(self): - scheduler = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(10)) - state = scheduler.state_dict() - self.assertIsNotNone(state['lr_lambdas'][0]) - - scheduler_copy = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(-1)) - scheduler_copy.load_state_dict(state) - for key in scheduler.__dict__.keys(): - if key not in {'optimizer'}: - self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key], allow_inf=True) - def 
_check_scheduler_state_dict(self, constr, constr2, epochs=10): scheduler = constr() for _ in range(epochs): diff --git a/test/test_torch.py b/test/test_torch.py index edd69473f8505b..2a8c897713111f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -22,7 +22,7 @@ from torch import multiprocessing as mp from common import TestCase, iter_indices, TEST_NUMPY, TEST_SCIPY, TEST_MKL, \ TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ - IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ROCM + IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, skipIfNoZeroSize, TEST_WITH_ROCM from multiprocessing.reduction import ForkingPickler if TEST_NUMPY: @@ -866,6 +866,7 @@ def test_multidim(x, dim): def test_dim_reduction(self): self._test_dim_reduction(self, lambda t: t) + @skipIfNoZeroSize def test_reduction_empty(self): fns_to_test = [ # name, function, identity @@ -929,6 +930,7 @@ def test_reduction_empty(self): self.assertEqual(torch.ones((2, 1, 4), device=device), xb.all(1, keepdim=True)) self.assertEqual(torch.ones((), device=device), xb.all()) + @skipIfNoZeroSize def test_pairwise_distance_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -1688,7 +1690,6 @@ def test_einsum(self): ("...ii->...i", I), # batch diagonal # -- Other ("bn,anm,bm->ba", l, w, r), # as torch.bilinear - ("... ii->...i ", I), # batch diagonal with spaces ] for test in test_list: actual = torch.einsum(test[0], test[1:]) @@ -2239,6 +2240,7 @@ def test_tensor_factory_cuda_type(self): self.assertTrue(x.is_cuda) torch.set_default_tensor_type(saved_type) + @skipIfNoZeroSize def test_tensor_factories_empty(self): # ensure we can create empty tensors from each factory function shapes = [(5, 0, 1), (0,), (0, 0, 1, 0, 2, 0, 0)] @@ -2925,6 +2927,7 @@ def _test_in_place_broadcastable(t0, t1, t2=None): def test_broadcast(self): self._test_broadcast(self, lambda t: t) + @skipIfNoZeroSize def test_broadcast_empty(self): # empty + empty self.assertRaises(RuntimeError, lambda: torch.randn(5, 0) + torch.randn(0, 5)) @@ -2940,17 +2943,6 @@ def test_broadcast_empty(self): torch.randn(0, 7, 0, 6, 5, 0, 1) + torch.randn(1, 1, 5, 1, 7)) self.assertRaises(RuntimeError, lambda: torch.randn(7, 0) + torch.randn(2, 1)) - def test_broadcast_tensors(self): - x0 = torch.randn(2, 1, 3) - x1 = torch.randn(3) - x2 = torch.randn(3, 1) - expected_size = (2, 3, 3) - - y0, y1, y2 = torch.broadcast_tensors(x0, x1, x2) - self.assertTrue(y0.size() == expected_size) - self.assertTrue(y1.size() == expected_size) - self.assertTrue(y2.size() == expected_size) - @staticmethod def _test_contiguous(self, cast): x = cast(torch.randn(1, 16, 5, 5)) @@ -2965,7 +2957,9 @@ def test_contiguous(self): return self._test_contiguous(self, lambda t: t) def test_empty_tensor_props(self): - sizes = [(0,), (0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] + sizes = [(0,)] + if torch._C._use_zero_size_dim(): + sizes += [(0, 3), (5, 0), (5, 0, 3, 0, 2), (0, 3, 0, 2), (0, 5, 0, 2, 0)] devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for size in sizes: for device in devices: @@ -3482,6 +3476,9 @@ def test_cat_empty_legacy(self): @staticmethod def _test_cat_empty(self, use_cuda=False): + if not torch._C._use_zero_size_dim(): + return + dtype = torch.float32 device = 'cuda' if use_cuda else 'cpu' @@ -3527,6 +3524,9 @@ def test_narrow(self): self.assertEqual(x.narrow(-2, -1, 1), torch.Tensor([[6, 7, 8]])) def test_narrow_empty(self): + if not 
torch._C._use_zero_size_dim(): + return + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(2, 3, 4, device=device) @@ -3658,7 +3658,7 @@ def test_randn(self): self.assertEqual(res1, res2) def test_slice(self): - empty = torch.empty(0, 4) + empty = torch.empty(0, 4) if torch._C._use_zero_size_dim() else torch.Tensor() x = torch.arange(0., 16).view(4, 4) self.assertEqual(x[:], x) self.assertEqual(x[:4], x) @@ -4951,7 +4951,10 @@ def consec(size, start=1): reference = conv_fn(consec((3, 3, 3))) # empty tensor indexing - self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) + if torch._C._use_zero_size_dim(): + self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new(0, 3, 3)) + else: + self.assertEqual(reference[conv_fn(torch.LongTensor())], reference.new()) self.assertEqual(reference[0], consec((3, 3)), 0) self.assertEqual(reference[1], consec((3, 3), 10), 0) @@ -4997,9 +5000,14 @@ def consec(size, start=1): self.assertEqual(reference[None, 2:5, None, None], reference.unsqueeze(0)[:, 2:5].unsqueeze(2).unsqueeze(2)) # indexing 0-length slice - self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) - self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) - self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) + if torch._C._use_zero_size_dim(): + self.assertEqual(torch.empty(0, 5, 5), reference[slice(0)]) + self.assertEqual(torch.empty(0, 5), reference[slice(0), 2]) + self.assertEqual(torch.empty(0, 5), reference[2, slice(0)]) + else: + self.assertEqual(torch.tensor([]), reference[slice(0)]) + self.assertEqual(torch.tensor([]), reference[slice(0), 2]) + self.assertEqual(torch.tensor([]), reference[2, slice(0)]) self.assertEqual(torch.tensor([]), reference[2, 1:1, 2]) # indexing with step @@ -5709,6 +5717,7 @@ def check(src, idx): check(src, idx) check(src.transpose(1, 2), idx) + @skipIfNoZeroSize def test_take_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -5739,6 +5748,7 @@ def test_put_accumulate(self): dst.put_(idx, src, accumulate=True) self.assertEqual(dst.tolist(), [[5, 7], [1, 1]]) + @skipIfNoZeroSize def test_put_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6060,6 +6070,7 @@ def _test_view(self, cast): def test_view(self): TestTorch._test_view(self, lambda x: x) + @skipIfNoZeroSize def test_view_empty(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.view(1, 0, 6, 1, 1).shape) @@ -6085,8 +6096,12 @@ def test_reshape(self): self.assertEqual(empty, empty.reshape(-1)) self.assertEqual(empty, empty.reshape([0])) # TODO: fix these once we have multi-dimensional empty tensors - self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) - self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) + if torch._C._use_zero_size_dim(): + self.assertEqual(empty.reshape([0, 1]).shape, (0, 1)) + self.assertEqual(empty.reshape([1, -1]).shape, (1, 0)) + else: + self.assertEqual(empty.reshape([0, 1]).shape, (0,)) + self.assertEqual(empty.reshape([1, -1]).shape, (0,)) self.assertRaises(RuntimeError, lambda: empty.reshape(1)) x = torch.randn(3, 3) @@ -6094,6 +6109,7 @@ def test_reshape(self): self.assertEqual(x.data_ptr(), x.reshape_as(torch.rand(1, 9, 1)).data_ptr()) self.assertRaises(RuntimeError, lambda: x.reshape_as(torch.rand(10))) + @skipIfNoZeroSize def test_empty_reshape(self): x = torch.randn(0, 6) self.assertEqual((1, 0, 6, 1, 1), x.reshape(1, 
0, 6, 1, 1).shape) @@ -6103,6 +6119,7 @@ def test_empty_reshape(self): # match NumPy semantics -- don't infer the size of dimension with a degree of freedom self.assertRaises(RuntimeError, lambda: x.reshape(0, -1)) + @skipIfNoZeroSize def test_tensor_shape_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6168,6 +6185,7 @@ def test_tensor_shape_empty(self): self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)]) # functions that operate over a dimension but don't reduce. + @skipIfNoZeroSize def test_dim_function_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6291,6 +6309,7 @@ def test_dim_function_empty(self): c = torch.randn((0, 1, 2), device=device) self.assertEqual(c, c.index_select(0, ind_empty)) + @skipIfNoZeroSize def test_blas_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6360,6 +6379,7 @@ def fn(torchfn, *args): A_LU, pivots = fn(torch.btrifact, (2, 0, 0)) self.assertEqual([(2, 0, 0), (2, 0)], [A_LU.shape, pivots.shape]) + @skipIfNoZeroSize def test_blas_alpha_beta_empty(self): devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: @@ -6385,6 +6405,7 @@ def test_blas_alpha_beta_empty(self): self.assertEqual(torch.full((2, 3), beta * value, device=device), torch.addmm(input=input, mat1=mat, mat2=mat2, alpha=alpha, beta=beta, out=out)) + @skipIfNoZeroSize @skipIfNoLapack def test_lapack_empty(self): # FIXME: these are just a selection of LAPACK functions -- we need a general strategy here. @@ -6875,6 +6896,9 @@ def test_nonzero(self): self.assertNotEqual(tensor[dst1[i, 0], dst1[i, 1], dst1[i, 2]].item(), 0) def test_nonzero_empty(self): + if not torch._C._use_zero_size_dim(): + return + devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] for device in devices: x = torch.randn(0, 2, 0, 5, 0, device=device) @@ -7499,11 +7523,15 @@ def test_load_error_msg(self): expected_err_msg = (".*You can only torch.load from a file that is seekable. 
" + "Please pre-load the data into a buffer like io.BytesIO and " + "try to load from it instead.") - - resource = FilelikeMock(data=b"data") - delattr(resource, "tell") - delattr(resource, "seek") - self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) + if PY3: + import urllib.request + import io + resource = urllib.request.urlopen('https://download.pytorch.org/test_data/linear.pt') + self.assertRaisesRegex(io.UnsupportedOperation, expected_err_msg, lambda: torch.load(resource)) + else: + import urllib + resource = urllib.urlopen('https://download.pytorch.org/test_data/linear.pt') + self.assertRaisesRegex(AttributeError, expected_err_msg, lambda: torch.load(resource)) def test_from_buffer(self): a = bytearray([1, 2, 3, 4]) @@ -7866,7 +7894,10 @@ def test_from_numpy(self): # check zero dimensional x = np.zeros((0, 2)) - self.assertEqual(torch.from_numpy(x).shape, (0, 2)) + if torch._C._use_zero_size_dim(): + self.assertEqual(torch.from_numpy(x).shape, (0, 2)) + else: + self.assertEqual(torch.from_numpy(x).shape, (0,)) # check ill-sized strides raise exception x = np.array([3., 5., 8.]) @@ -7916,20 +7947,6 @@ def test_ctor_with_numpy_array(self): for i in range(len(array)): self.assertEqual(tensor[i], array[i]) - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") - def test_ctor_with_numpy_scalar_ctor(self): - dtypes = [ - np.double, - np.float, - np.float16, - np.int64, - np.int32, - np.int16, - np.uint8 - ] - for dtype in dtypes: - self.assertEqual(dtype(42), torch.tensor(dtype(42)).item()) - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_numpy_index(self): i = np.int32([0, 1, 2]) @@ -8017,17 +8034,6 @@ def test_numpy_array_interface(self): for i in range(len(x)): self.assertEqual(geq2_x[i], geq2_array[i]) - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") - def test_multiplication_numpy_scalar(self): - np_sc = np.float64(2.0) - t = torch.ones(2, requires_grad=True) - r1 = np_sc * t - self.assertIsInstance(r1, torch.Tensor) - self.assertTrue(r1.requires_grad) - r2 = t * np_sc - self.assertIsInstance(r2, torch.Tensor) - self.assertTrue(r2.requires_grad) - def test_error_msg_type_translation(self): with self.assertRaisesRegex( RuntimeError, diff --git a/third_party/eigen b/third_party/eigen index cafae68f33f7f4..e9e95489a0b241 160000 --- a/third_party/eigen +++ b/third_party/eigen @@ -1 +1 @@ -Subproject commit cafae68f33f7f41270b2e8c2dd181f510aa4d918 +Subproject commit e9e95489a0b241412e31f0525e85b2fab386c786 diff --git a/third_party/onnx b/third_party/onnx index 32ac71b1b9c1bd..c761845c7f6880 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 32ac71b1b9c1bd7f196eed3b311734ec6ab3c367 +Subproject commit c761845c7f6880ab7eb7e2866d673834c7149e89 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index a66cb77f8ce9dd..14fd6d7cf5e09c 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -201,9 +201,6 @@ - name: conv_tbc(Tensor self, Tensor weight, Tensor bias, int64_t pad) self, weight, bias: conv_tbc_backward(grad, self, weight, bias, pad) -- name: _ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank) - log_probs: _ctc_loss_backward(grad, log_probs, targets, input_lengths, target_lengths, result0, result1, blank) - - name: det(Tensor self) self: det_backward(grad, self, result) @@ -311,12 +308,6 @@ self: gesv_backward_self(grad, self, A) A: gesv_backward_A(grad, self, A, result0) -- name: 
grid_sampler_2d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) - input, grid: grid_sampler_2d_backward(grad, input, grid, interpolation_mode, padding_mode) - -- name: grid_sampler_3d(Tensor input, Tensor grid, int64_t interpolation_mode, int64_t padding_mode) - input, grid: grid_sampler_3d_backward(grad, input, grid, interpolation_mode, padding_mode) - - name: gt_(Tensor self, Scalar other) self: zeros_like(self) @@ -811,8 +802,8 @@ - name: relu(Tensor self) self: threshold_backward(grad, self, 0, 0) -- name: elu_forward(Tensor self, Scalar alpha, Scalar scale, Scalar input_scale) - self: elu_backward(grad, alpha, scale, input_scale, output) +- name: elu_forward(Tensor self, Scalar alpha, Scalar scale) + self: elu_backward(grad, alpha, scale, output) - name: glu_forward(Tensor self, int64_t dim) self: glu_backward(grad, self, dim) @@ -983,6 +974,12 @@ - name: thnn_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, IntList kernel_size, IntList stride, IntList padding, IntList dilation, Tensor columns, Tensor ones, std::array output_mask) grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, {{0, 0, 0}}, 1, false, false, false, grad_input_mask) +- name: thnn_grid_sampler_bilinear2d_forward(Tensor self, Tensor grid, int64_t padding_mode) + self, grid: thnn_grid_sampler_bilinear2d_backward(grad, self, grid, padding_mode) + +- name: thnn_grid_sampler_bilinear3d_forward(Tensor self, Tensor grid, int64_t padding_mode) + self, grid: thnn_grid_sampler_bilinear3d_backward(grad, self, grid, padding_mode) + # NN double backwards support - name: adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) @@ -1009,9 +1006,9 @@ grad_output: avg_pool3d(grad, kernel_size, stride, padding, ceil_mode, count_include_pad) self: zeros_like(self) -- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) - grad_output: elu_backward(grad, alpha, scale, input_scale, output) - output: grad * grad_output * input_scale * (output < 0).toType(grad.type()) +- name: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Tensor output) + grad_output: elu_backward(grad, alpha, scale, output) + output: grad * grad_output * (output < 0).toType(grad.type()) - name: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, IntList kernel_size, IntList output_size, Tensor indices) grad_output: max_pool_double_backward(grad, indices, 2) @@ -1148,8 +1145,6 @@ output: -2 * output * grad * grad_output # cudnn -- name: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, IntList input_lengths, IntList target_lengths, int64_t blank, bool deterministic) - log_probs: result1 - name: cudnn_convolution_transpose(Tensor self, Tensor weight, Tensor bias, IntList padding, IntList output_padding, IntList stride, IntList dilation, int64_t groups, bool benchmark, bool deterministic) self, weight, bias: cudnn_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, grad_input_mask) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 2bee61b024317e..45af42655f96cc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -340,8 +340,6 @@ def save_variables(saved_variables, is_output): elif arg['type'] == 'TensorList': name += '_' expr = 'make_saved_variable_list({})'.format(arg['name']) - elif 
arg['type'] == 'IntList': - expr = expr + ".vec()" stmts.append('grad_fn->{} = {};'.format(name, expr)) return stmts diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 0622fae5f2e8e8..f859f814b4f8bc 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -175,7 +175,7 @@ Tensor prod_safe_zeros_backward(const Tensor &grad, const Tensor& inp, int64_t d return grad; } - auto ones_size = inp.sizes().vec(); + std::vector ones_size(inp.sizes()); ones_size[dim] = 1; Tensor ones = at::ones(ones_size, grad.type()); Tensor exclusive_normal_nocp = at::cat({ones, inp.narrow(dim, 0, inp.size(dim) - 1)}, dim); @@ -328,7 +328,7 @@ Tensor cumprod_backward(const Tensor &grad, const Tensor &input, int64_t dim) { return sum_scan_exclusive(result * grad, dim) / input; } - auto ones_size = input.sizes().vec(); + std::vector ones_size(input.sizes()); ones_size[dim] = 1; Tensor ones = at::ones({1}, grad.type()).expand(ones_size); Tensor grad_input = at::zeros(input.sizes(), grad.type()); @@ -461,7 +461,7 @@ Tensor mm_mat2_backward(const Tensor & grad, const Tensor & mat1, IntList sizes, } Tensor renorm_backward(const Tensor & grad, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { - auto transposed_sizes = self.transpose(dim, 0).sizes().vec(); + auto transposed_sizes = std::vector(self.transpose(dim, 0).sizes()); auto flatten = [&](const Tensor & t) { return t.transpose(dim, 0).contiguous().view({t.size(dim), -1}); }; @@ -637,7 +637,7 @@ Tensor split_with_sizes_backward(const std::vector &g grads_all_defined[j] = grads[j]; } else { auto length = split_sizes[j]; - auto grad_size = sizes.vec(); + std::vector grad_size(sizes); grad_size[dim] = length; grads_all_defined[j] = at::zeros(grad_size, type); } @@ -659,7 +659,7 @@ Tensor split_backward(const std::vector &grads, Tensor max_pool_double_backward(const Tensor & grad, const Tensor & indices, int dim) { AT_ASSERT(indices.dim() >= dim); - auto size = indices.sizes().slice(0, indices.dim() - dim).vec(); + auto size = std::vector(indices.sizes().slice(0, indices.dim() - dim)); size.push_back(-1); auto indices_view = indices.view(size); return grad.contiguous().view(size).gather(-1, indices_view).view(indices.sizes()); @@ -686,7 +686,7 @@ Tensor glu_double_backward(const Tensor & grad, const Tensor & grad_output, cons Tensor glu_double_backward_grad_output(const Tensor & grad, const Tensor & input, int64_t dim) { if (dim < 0) dim += input.dim(); - auto sizes = input.sizes().vec(); + std::vector sizes = input.sizes(); sizes[dim] /= 2; auto tmp = grad * glu_backward(at::ones(sizes, input.type()), input, dim); return tmp.narrow(dim, 0, sizes[dim]) + tmp.narrow(dim, sizes[dim], sizes[dim]); @@ -1545,27 +1545,27 @@ Tensor symeig_backward(const std::vector &grads, cons bool eigenvectors, bool upper, const Tensor& lambda, const Tensor& v) { auto glambda = grads[0]; auto gv = grads[1]; - + auto vt = v.t(); - + if (!eigenvectors) { throw std::runtime_error(std::string("cannot compute backward without " "computing eigenvectors in forward pass")); } - + Tensor result; if (gv.defined()) { Tensor F = lambda.unsqueeze(0).expand_as(self).clone(); F.sub_(at::unsqueeze(lambda, 1)); F.diagonal().fill_(INFINITY); F.pow_(-1); - + F.mul_(vt.mm(gv)); result = v.mm(F.mm(vt)); } else { result = at::zeros_like(self); } - + if (glambda.defined()) { result.add_((v * glambda).mm(vt)); } diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 
00d927f1fdf7f8..ae95bf7197770e 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -29,7 +29,7 @@ struct TypeAndSize { TypeAndSize() : type(nullptr) {} /* implicit */ TypeAndSize(const Tensor & t) - : sizes(t.sizes().vec()) + : sizes(t.sizes()) , type(&t.type()) {} Tensor zeros() { return at::zeros(sizes, *type); } diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index bd4c59cfe9d380..2f1adf0ab59f4b 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -398,7 +398,7 @@ Tensor VariableType::contiguous(const Tensor & self) const { static std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { - args_sizes[i] = tensors[i].sizes().vec(); + args_sizes[i] = tensors[i].sizes(); } return args_sizes; } diff --git a/tools/build_pytorch_libs.sh b/tools/build_pytorch_libs.sh index 8f79c2830e96c0..4a0dbd04c905f1 100755 --- a/tools/build_pytorch_libs.sh +++ b/tools/build_pytorch_libs.sh @@ -97,11 +97,7 @@ if [[ $(uname) == 'Darwin' ]]; then LDFLAGS="$LDFLAGS -Wl,-rpath,@loader_path" LD_POSTFIX=".dylib" else - if [[ $USE_ROCM -eq 1 ]]; then - LDFLAGS="$LDFLAGS -Wl,-rpath,\\\\\\\$ORIGIN" - else - LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" - fi + LDFLAGS="$LDFLAGS -Wl,-rpath,\$ORIGIN" fi CPP_FLAGS=" -std=c++11 " GLOO_FLAGS="" diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 77b101dedf0f3e..abbadc70691b46 100644 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -7,7 +7,6 @@ import subprocess import sys - DEFAULT_FILE_PATTERN = r".*\.[ch](pp)?" # @@ -start,count +start,count @@ @@ -27,11 +26,6 @@ def run_shell_command(arguments, process_name=None): return output.decode() -def normalize_directory_path(path): - """Normalizes a directory path.""" - return path.rstrip('/') - - def transform_globs_into_regexes(globs): """Turns glob patterns into regular expressions.""" return [glob.replace("*", ".*").replace("?", ".") for glob in globs] @@ -55,37 +49,16 @@ def git_diff(args, verbose): return run_shell_command(command, process_name="git diff") -def filter_files(files, file_patterns, verbose): +def filter_files(files, file_patterns): """Returns all files that match any of the patterns.""" filtered = [] for file in files: - has_match = False for pattern in file_patterns: - if pattern.search(file): + if pattern.match(file): filtered.append(file) - has_match = True - if not has_match and verbose: - message = "{} does not match any ".format(file) - message += "file pattern in {{{}}}".format(', '.join(map(str, file_patterns))) - print(message) return filtered -def remove_recursive_files(files, paths, verbose): - """ - Removes all files that are not immediately under one of the given paths. - """ - for file in files: - if os.path.dirname(file) in paths: - yield file - else: - if verbose: - - message = "{} ({}) does not match any ".format(file, os.path.dirname(file)) - message += "non-recursive path in {{{}}}".format(", ".join(paths)) - print(message) - - def get_changed_files(revision, paths, verbose): """Runs git diff to get the paths of all changed files.""" # --diff-filter AMU gets us files that are (A)dded, (M)odified or (U)nmerged (in the working copy). 
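Note on the simplified filter_files() in tools/clang_tidy.py above: a minimal, self-contained sketch of how the new pattern-based filtering behaves. The DEFAULT_FILE_PATTERN regex and the filter_files body are taken from the patch; the sample paths and the __main__ harness are illustrative assumptions only, not part of the change.

import re

# In tools/clang_tidy.py the pattern is kept as a raw string and compiled elsewhere;
# compiling it up front here keeps the sketch self-contained.
DEFAULT_FILE_PATTERN = re.compile(r".*\.[ch](pp)?")

def filter_files(files, file_patterns):
    """Returns all files that match any of the patterns."""
    filtered = []
    for file in files:
        for pattern in file_patterns:
            if pattern.match(file):
                filtered.append(file)
    return filtered

if __name__ == "__main__":
    # Hypothetical candidate paths; only the C/C++ sources survive the filter.
    candidates = ["torch/csrc/Module.cpp", "tools/clang_tidy.py", "aten/src/ATen/Tensor.h"]
    print(filter_files(candidates, [DEFAULT_FILE_PATTERN]))
    # ['torch/csrc/Module.cpp', 'aten/src/ATen/Tensor.h']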
@@ -179,17 +152,7 @@ def parse_options(): ) parser.add_argument("-r", "--revision", help="Git revision to get changes from") parser.add_argument( - "-p", - "--paths", - nargs="+", - default=["."], - help="Lint only the given paths (recursively)", - ) - parser.add_argument( - "-n", - "--no-recursive", - action="store_true", - help="If paths are supplied with -p/--paths, do not recurse into paths", + "-p", "--paths", nargs="+", default=["."], help="Lint only the given paths" ) parser.add_argument( "-s", @@ -210,15 +173,12 @@ def parse_options(): def main(): options = parse_options() - paths = map(normalize_directory_path, options.paths) if options.revision: - files = get_changed_files(options.revision, paths, options.verbose) + files = get_changed_files(options.revision, options.paths, options.verbose) else: - files = get_all_files(paths) - if options.no_recursive: - files = remove_recursive_files(files, paths, options.verbose) + files = get_all_files(options.paths) file_patterns = get_file_patterns(options.glob, options.regex) - files = filter_files(files, file_patterns, options.verbose) + files = filter_files(files, file_patterns) # clang-tidy error's when it does not get input files. if not files: diff --git a/tools/cpp_build/build_caffe2.sh b/tools/cpp_build/build_caffe2.sh index 6a50c14e05523e..b35435acb388c6 100755 --- a/tools/cpp_build/build_caffe2.sh +++ b/tools/cpp_build/build_caffe2.sh @@ -24,7 +24,6 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_BUILD_TYPE:STRING=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX:STRING=$INSTALL_PREFIX \ -DCMAKE_INSTALL_MESSAGE=NEVER \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -G "$GENERATE" \ $PYTORCHPATH/ $MAKE -j "$JOBS" install diff --git a/tools/cpp_build/build_libtorch.sh b/tools/cpp_build/build_libtorch.sh index 6dd9a589cf1074..92a9b9981ed697 100755 --- a/tools/cpp_build/build_libtorch.sh +++ b/tools/cpp_build/build_libtorch.sh @@ -24,7 +24,6 @@ cmake -DUSE_CUDA:BOOL=$USE_CUDA \ -DCMAKE_INSTALL_MESSAGE=NEVER \ -Dnanopb_BUILD_GENERATOR:BOOL=OFF \ -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ -DVERBOSE:BOOL=${VERBOSE:-0} \ -G "$GENERATE" \ $PYTORCHPATH/torch diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 5a76d447ad2498..ad9ad2e05c4f4c 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -52,6 +52,28 @@ def jit_type_of(arg): typ = '{}?'.format(typ) return typ +# map from aten 'simple_type' to the function that will cast an attribute value +# to that type +FROM_ATTRIBUTE = { + 'Device': 'as_device(node->is(attr::{}))', + 'IntList': 'std::vector<int64_t>(node->is(attr::{}))', + 'Layout': 'static_cast<at::Layout>(node->i(attr::{}))', + 'Scalar': 'Scalar(node->t(attr::{}))', + 'ScalarType': 'static_cast<at::ScalarType>(node->i(attr::{}))', + 'Tensor': 'node->t(attr::{})', + 'bool': 'bool(node->i(attr::{}))', + 'double': 'node->f(attr::{})', + 'int64_t': 'node->i(attr::{})', + 'std::array<bool,2>': 'as_bool_array<2>(node->is(attr::{}))', + 'std::array<bool,3>': 'as_bool_array<3>(node->is(attr::{}))', + 'std::array<bool,4>': 'as_bool_array<4>(node->is(attr::{}))', +} + + +def from_attribute(arg): + simple_type = arg['simple_type'] + return FROM_ATTRIBUTE[simple_type].format(arg['name']) + # map from aten 'simple_type' to the function that will turn a tensor into # that type @@ -62,7 +84,6 @@ def jit_type_of(arg): 'Scalar': '{}.toScalar()', 'ScalarType': 'static_cast<at::ScalarType>({}.toInt())', 'Tensor': '{}.toTensor()', - 'TensorList': '{}.toTensorList()->elements()', 'bool': 'bool({}.toInt())', 'double': '{}.toDouble()', 'int64_t':
'{}.toInt()', @@ -77,13 +98,15 @@ def from_ivalue(arg, value): return FROM_IVALUE[simple_type].format(value) +KW_ACCESS = CodeTemplate("""(node->${method}(Symbol::attr("${name}")))""") + CALL_NAMESPACE = CodeTemplate("""\ auto result = at::${name}( ${args} ); """) CALL_METHOD = CodeTemplate("""\ -DeviceGuard device_guard(deviceForInputs(stack, ${num_inputs})); +DeviceGuard device_guard(deviceForInputs(stack, ${num_dynamic_inputs})); auto result = (${first}).${name}( ${args} ); @@ -99,20 +122,24 @@ def from_ivalue(arg, value): ); """) +# TODO (apaszke): remove the attributed codepath once we remove attributes CONSTRUCTOR = CodeTemplate("""\ -[](Stack & stack) { +[](Node *node) { + ${kw_assignments} + return Operation([=](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); ${call} - drop(stack, ${num_inputs}); + drop(stack, ${num_dynamic_inputs}); pack(stack, std::move(result)); return 0; + }); } """) OPERATOR = CodeTemplate("""\ Operator( "${signature}", - ${op} + ${ops} ), """) @@ -144,6 +171,9 @@ def is_jit_op(decl): # we currently only support vararg tensor lists when they are the _first_ argument # and the only tensor argument arguments = decl['arguments'] + # Only support a single TensorList arg + if sum(arg['simple_type'] == 'TensorList' for arg in arguments) > 1: + return False return ((not decl['api_name'].endswith('_') or is_magic_method(decl['api_name'])) and not decl['name'].endswith('_out') and @@ -167,7 +197,7 @@ def gen_jit_dispatch(declarations, out, template_path): ops = [] - def get_invocation(decl, args, num_inputs): + def get_invocation(decl, args, num_dynamic_inputs): # because the arg list can get lengthy we put them on a separate line def pack_arguments(args): @@ -181,36 +211,109 @@ def pack_arguments(args): elif 'namespace' in decl['method_of']: return CALL_NAMESPACE.substitute(name=decl['name'], args=pack_arguments(args), - num_inputs=num_inputs) + num_dynamic_inputs=num_dynamic_inputs) else: return CALL_METHOD.substitute( name=decl['name'], first=args[0], args=pack_arguments(args[1:]), - num_inputs=num_inputs) + num_dynamic_inputs=num_dynamic_inputs) - def emit_decl_variant(decl): + def emit_decl_variant(decl, is_positional_arg, has_tensorlist): + # is_positional_arg is a boolean list the same length as decl['arguments'] + # that indicates if the argument should come from the positional list + # of inputs. If false, the argument comes from the constant attributes kw_assignments = [] arguments = [] - num_inputs = len(decl['arguments']) + + if has_tensorlist: + kw_assignments.append('size_t varargs_length = node->inputs().size();') + # arguments look like: [tensor list], arg1, arg2, arg3 + # we use peek(, static_inputs) to read the non-vararg inputs + # from the end of the stack + static_inputs = sum(is_positional_arg) - 1 + num_dynamic_inputs = 'varargs_length' + tensorlist_idx = [i for i, arg in enumerate(decl['arguments']) if arg['simple_type'] == 'TensorList'][0] + else: + static_inputs = sum(is_positional_arg) + num_dynamic_inputs = static_inputs real_inputs = 0 - for arg in decl['arguments']: - if arg['simple_type'] in default_only_types: + for i, arg in enumerate(decl['arguments']): + # This conditional allows us to process a flattened argument list that + # contains a single TensorList. Given the sequence of arguments: + # a b c [d e f g] h i # [] is the list + # + # 1.
For the section where we are processing positional inputs before the + # TensorList: + # a b c [d e f g] h i # [] is the list + # ~~~~~~~~~~~~ <- N + # we set this view_length to the total number of varargs inputs (i.e. the length) + # of the whole argument list. This means that when indexing into the list using peek(), + # we will retrieve arguments at their true indices (i.e. peek at 0 points to a, + # 1 points to b, etc...). Similarly, we can use peekSlice() to index into the + # list itself this way. + # 2. After the list: + # a b c [d e f g] h i # [] is the list + # ~~~~~~ <- N + # Here we set the view length to static_inputs. In our example, + # we effectively ignore the fact that we have a list here. What is + # significant is that our index i is equivalent when the view length + # is right-justified, whether we have the list or not. Concretely, + # indexing h or i from `a b c [d e f g] h i` is equivalent to indexing + # h or i from `a b c h i`. + view_length = 'varargs_length' if has_tensorlist and i < tensorlist_idx else static_inputs + + if arg['simple_type'] == 'TensorList': + # NOTE: don't advance real_inputs here. After this we are going + # to switch over to indexing from the end as if we only had + # the static arguments. + arguments.append('toTensors(peekSlice(stack, {}, varargs_length - {}, varargs_length))' + .format(real_inputs, static_inputs)) + elif arg['simple_type'] in default_only_types: arguments.append(arg['default']) - else: - value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, num_inputs) + elif is_tensor_arg(arg) or is_positional_arg[i]: + value = '(std::move(peek(stack, {}, {})))'.format(real_inputs, view_length) arguments.append(from_ivalue(arg, value)) real_inputs += 1 + else: + assign = "auto {} = {};".format(arg['name'], from_attribute(arg)) + kw_assignments.append(assign) + arguments.append(arg['name']) - call = get_invocation(decl, arguments, num_inputs) + call = get_invocation(decl, arguments, num_dynamic_inputs) returns = decl['returns'] + all_scalars = all(r['dynamic_type'] != 'TensorList' for r in returns) constructor = CONSTRUCTOR.substitute(name=decl['name'], call=call, kw_assignments=kw_assignments, - num_inputs=num_inputs) + num_dynamic_inputs=num_dynamic_inputs) return constructor + def emit_decl(decl): + arguments = decl['arguments'] + has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) + num_tensor_args = sum(map(is_tensor_arg, arguments)) + + # Right now, we generate dispatch methods that either take all non-tensor arguments + # as attributes, or don't use any attributes at all. In the future we might want to + # have something in the middle too (might be useful for e.g. constant propagation + # into attributes, as that would allow us to avoid reparsing tensors into scalar + # args at every invocation). + + all_real_arguments_are_inputs = tuple(arg['simple_type'] not in default_only_types for arg in arguments) + only_tensors_are_inputs = tuple(is_tensor_arg(arg) for arg in arguments) + + variants = [emit_decl_variant(decl, all_real_arguments_are_inputs, has_tensorlist)] + # in some cases there are no inputs that are possibly attributes, so the + # variants are actually the same. If so avoid generating both to save compilation + # time. + if all_real_arguments_are_inputs != only_tensors_are_inputs: + variants += [',', emit_decl_variant(decl, only_tensors_are_inputs, has_tensorlist)] + + ops.append(OPERATOR.substitute(signature=signature(decl), + ops=variants)) + # This function declares an order on declarations.
This is necessary because # there is some ambiguity in the choice of overload: if an argument is overloaded # to accept both Scalar and Tensor, the schema with the Tensor should come first @@ -273,8 +376,7 @@ def declkey(decl): jit_decls = sort_decls(jit_decls) for decl in jit_decls: - ops.append(OPERATOR.substitute(signature=signature(decl), - op=emit_decl_variant(decl))) + emit_decl(decl) # Sort the generated snippets to ensure that the generation is deterministic env = { diff --git a/tools/jit/templates/register_aten_ops.cpp b/tools/jit/templates/register_aten_ops.cpp index 3dc973463d6e90..06ad9c2840b1cc 100644 --- a/tools/jit/templates/register_aten_ops.cpp +++ b/tools/jit/templates/register_aten_ops.cpp @@ -29,6 +29,7 @@ using autograd::Variable; using autograd::variable_list; using at::Scalar; using at::Tensor; +using at::TensorList; using at::TensorOptions; using at::DeviceGuard; @@ -41,20 +42,26 @@ int deviceForInputs(Stack & stack, size_t N) { return t.type().is_cuda() ? (int) t.get_device() : -1; } +std::vector toTensors(at::ArrayRef ivalues) { + return fmap(ivalues, [](const IValue& v) { + return v.toTensor(); + }); +} + template -std::array as_bool_array(at::ArrayRef vec) { +std::array as_bool_array(const std::vector& vec) { std::array res; JIT_ASSERT(vec.size() == N); std::copy(vec.begin(), vec.end(), res.begin()); return res; } -at::Device as_device(ArrayRef elements) { +at::Device as_device(const std::vector& elements) { return at::Device(static_cast(elements[0]), elements[1]); } RegisterOperators reg({ - ${constructors} +${constructors} }); } // anon namespace diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 057bf6efeac3dd..88546fda7ed604 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -102,7 +102,6 @@ add_custom_command( "${TOOLS_PATH}/autograd/gen_autograd.py" "${TOOLS_PATH}/autograd/gen_autograd_functions.py" "${TOOLS_PATH}/autograd/gen_variable_type.py" - "${TOOLS_PATH}/jit/gen_jit_dispatch.py" "${TOOLS_PATH}/jit/templates/register_aten_ops.cpp" "${TOOLS_PATH}/jit/templates/aten_interned_strings.h" WORKING_DIRECTORY "${TORCH_SRC_DIR}/..") @@ -139,7 +138,6 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/operator.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/batch_mm.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/canonicalize.cpp - ${TORCH_SRC_DIR}/csrc/jit/passes/constant_propagation.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/common_subexpression_elimination.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/create_autodiff_subgraphs.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/dead_code_elimination.cpp @@ -163,6 +161,8 @@ set(TORCH_SRCS ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp ${TORCH_SRC_DIR}/csrc/jit/tracer.cpp ${TORCH_SRC_DIR}/csrc/jit/type.cpp + ${TORCH_SRC_DIR}/csrc/onnx/onnx.cpp + ${TORCH_SRC_DIR}/csrc/onnx/onnx.npb.cpp ${TORCH_SRC_DIR}/csrc/torch.cpp ${TORCH_SRC_DIR}/csrc/utils/tensor_flatten.cpp ${TORCH_SRC_DIR}/csrc/utils/variadic.cpp @@ -267,12 +267,6 @@ if(OPENMP_FOUND) target_link_libraries(torch -fopenmp) endif() -if (NOT NO_API AND NOT USE_ROCM) - target_include_directories(torch PUBLIC - ${TORCH_SRC_DIR}/csrc/api - ${TORCH_SRC_DIR}/csrc/api/include) -endif() - if(USE_CUDA) if(MSVC) set(TORCH_CUDA_LIBRARIES @@ -371,7 +365,7 @@ install(TARGETS torch ARCHIVE DESTINATION "${TORCH_INSTALL_LIB_DIR}") # JIT Tests. 
TODO: Put into test/cpp/jit folder -if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) +if (NOT MSVC AND NOT APPLE AND NOT USE_ROCM) add_executable(test_jit ${TORCH_SRC_DIR}/csrc/jit/test_jit.cpp) target_link_libraries(test_jit torch ${TORCH_CUDA_LIBRARIES}) target_compile_definitions(test_jit PUBLIC USE_CATCH _FORCE_INLINES) @@ -385,6 +379,10 @@ if (BUILD_TORCH_TEST AND NOT MSVC AND NOT APPLE AND NOT USE_ROCM) endif() if (BUILD_TORCH_TEST AND NOT NO_API AND NOT USE_ROCM) + target_include_directories(torch PUBLIC + ${TORCH_SRC_DIR}/csrc/api + ${TORCH_SRC_DIR}/csrc/api/include) + set(TORCH_API_TEST_DIR "${TORCH_SRC_DIR}/../test/cpp/api") add_executable(test_api diff --git a/torch/__init__.py b/torch/__init__.py index a40111bcca6b02..3fbb0b76fcc386 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -298,8 +298,3 @@ def manager_path(): # attach docstrings to torch and tensor functions from . import _torch_docs, _tensor_docs, _storage_docs del _torch_docs, _tensor_docs, _storage_docs - - -def compiled_with_cxx11_abi(): - r"""Returns whether PyTorch was built with _GLIBCXX_USE_CXX11_ABI=1""" - return _C._GLIBCXX_USE_CXX11_ABI diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index af367c3e544905..2194310a46d522 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -402,6 +402,16 @@ PyObject *THPModule_isDefaultTypeCuda(PyObject *_unused, PyObject *arg) { END_HANDLE_TH_ERRORS } +PyObject *THPModule_useZeroSizeDim(PyObject *_unused, PyObject *arg) { + HANDLE_TH_ERRORS +#ifdef USE_TH_SIZE_ZERO_DIM + Py_RETURN_TRUE; +#else + Py_RETURN_FALSE; +#endif + END_HANDLE_TH_ERRORS +} + static PyMethodDef TorchMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, NULL}, {"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, NULL}, @@ -432,6 +442,7 @@ static PyMethodDef TorchMethods[] = { {"set_flush_denormal", (PyCFunction)THPModule_setFlushDenormal, METH_O, NULL}, {"get_default_dtype", (PyCFunction)THPModule_getDefaultDtype, METH_NOARGS, NULL}, {"_is_default_type_cuda", (PyCFunction)THPModule_isDefaultTypeCuda, METH_NOARGS, NULL}, + {"_use_zero_size_dim", (PyCFunction)THPModule_useZeroSizeDim, METH_NOARGS, NULL}, {NULL, NULL, 0, NULL} }; @@ -613,13 +624,6 @@ static PyObject* initModule() { ASSERT_TRUE(PyModule_AddObject(module, "has_mkl", at::hasMKL() ? Py_True : Py_False) == 0); -#ifdef _GLIBCXX_USE_CXX11_ABI - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", - _GLIBCXX_USE_CXX11_ABI ? Py_True : Py_False) == 0); -#else - ASSERT_TRUE(PyModule_AddObject(module, "_GLIBCXX_USE_CXX11_ABI", Py_False) == 0); -#endif - auto& defaultGenerator = at::globalContext().defaultGenerator(at::kCPU); THPDefaultGenerator = (THPGenerator*)THPGenerator_NewWithGenerator( defaultGenerator); diff --git a/torch/csrc/api/include/torch/nn/cursor.h b/torch/csrc/api/include/torch/nn/cursor.h index 2ae5c5d93752c1..c0f56eea72fbd0 100644 --- a/torch/csrc/api/include/torch/nn/cursor.h +++ b/torch/csrc/api/include/torch/nn/cursor.h @@ -48,7 +48,7 @@ class CursorBase { /// A `(key, value)` pair exposed by cursor iterators. 
struct Item { - Item(const std::string& key_, T& value_); + Item(const std::string& key_, T& module_); T& operator*(); const T& operator*() const; diff --git a/torch/csrc/autograd/anomaly_mode.h b/torch/csrc/autograd/anomaly_mode.h index 1f12f0a65c7460..7327d03f11b887 100644 --- a/torch/csrc/autograd/anomaly_mode.h +++ b/torch/csrc/autograd/anomaly_mode.h @@ -18,7 +18,7 @@ struct AnomalyMode { struct AnomalyMetadata { - virtual ~AnomalyMetadata() = default; + virtual ~AnomalyMetadata(){}; virtual void store_stack() = 0; virtual void print_stack() = 0; }; diff --git a/torch/csrc/autograd/aten_variable_hooks.cpp b/torch/csrc/autograd/aten_variable_hooks.cpp index 2f3899e4f8b59a..7a2c3974c2227c 100644 --- a/torch/csrc/autograd/aten_variable_hooks.cpp +++ b/torch/csrc/autograd/aten_variable_hooks.cpp @@ -6,7 +6,6 @@ namespace torch { namespace autograd { struct VariableHooks : public at::VariableHooksInterface { VariableHooks(at::VariableHooksArgs) {} void registerVariableTypeFor(at::Context*, at::Backend, at::ScalarType) const override; - at::Type& getVariableType(const at::Type&) const override; }; // Sigh, the registry doesn't support namespaces :( @@ -21,8 +20,4 @@ void VariableHooks::registerVariableTypeFor(at::Context* context, at::Backend ba register_variable_type_for(baseType); } -at::Type& VariableHooks::getVariableType(const at::Type& baseType) const { - return *VariableType::getType(baseType); -} - }} // torch::autograd diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 74e15f5caefe9d..8309ba1ce1038c 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -159,7 +159,7 @@ struct GraphTask { std::unordered_map exec_info; std::vector captured_vars; - void init_to_execute(Function& graph_root, const edge_list& outputs); + void init_to_execute(Function& graph_root, const edge_list& captures); // The value of worker_device in the thread that created this task. 
// See Note [Reentrant backwards] @@ -499,14 +499,14 @@ struct ClearCallbacks { std::mutex& callbacks_lock; }; -auto Engine::execute(const edge_list& roots, +auto Engine::execute(const edge_list& input_roots, const variable_list& inputs, bool keep_graph, bool create_graph, const edge_list& outputs) -> variable_list { std::call_once(start_threads_flag, &Engine::start_threads, this); - validate_outputs(roots, const_cast(inputs), [](const std::string& msg) { + validate_outputs(input_roots, const_cast(inputs), [](const std::string& msg) { return msg; }); @@ -517,7 +517,7 @@ auto Engine::execute(const edge_list& roots, std::unique_lock lock(graph_task.mutex); // Now compute the dependencies for all executable functions and queue the root - auto graph_root = std::make_shared(roots, inputs); + auto graph_root = std::make_shared(input_roots, inputs); compute_dependencies(graph_root.get(), graph_task); if (!outputs.empty()) { graph_task.init_to_execute(*graph_root, outputs); diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index 94490303ccc240..db8b3357ac2536 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -57,7 +57,7 @@ struct TORCH_API Engine { ReadyQueue& ready_queue(int device); void start_threads(); virtual void thread_init(int device); - virtual void thread_main(GraphTask *graph_task); + virtual void thread_main(GraphTask *task); virtual void thread_on_exception(FunctionTask& task, std::exception& e); std::once_flag start_threads_flag; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 46a80b90b29ffa..b02bdf3928f2ff 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -328,7 +328,7 @@ struct TORCH_API Function : std::enable_shared_from_this { /// See Function::is_traceable() for definition. 
struct TraceableFunction : public Function { using Function::Function; - bool is_traceable() final { + bool is_traceable() final override { return true; } }; diff --git a/torch/csrc/autograd/function_hook.h b/torch/csrc/autograd/function_hook.h index f3cf5b2e793c6a..03c52fea54535c 100644 --- a/torch/csrc/autograd/function_hook.h +++ b/torch/csrc/autograd/function_hook.h @@ -10,12 +10,12 @@ struct Variable; using variable_list = std::vector; struct FunctionPreHook { - virtual ~FunctionPreHook() = default; + virtual ~FunctionPreHook() {} virtual variable_list operator()(const variable_list& grads) = 0; }; struct FunctionPostHook { - virtual ~FunctionPostHook() = default; + virtual ~FunctionPostHook() {} virtual variable_list operator()(const variable_list& grad_input, const variable_list& grad_output) = 0; }; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index db86ae428d4060..44d4b7f106c860 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -6,9 +6,9 @@ namespace torch { namespace autograd { struct AccumulateGrad : public Function { - explicit AccumulateGrad(Variable variable_); + explicit AccumulateGrad(Variable variable); - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; Variable variable; }; diff --git a/torch/csrc/autograd/functions/basic_ops.cpp b/torch/csrc/autograd/functions/basic_ops.cpp index c4a54d99d08702..b04b0f25ca42d5 100644 --- a/torch/csrc/autograd/functions/basic_ops.cpp +++ b/torch/csrc/autograd/functions/basic_ops.cpp @@ -11,7 +11,7 @@ namespace torch { namespace autograd { -auto Error::apply(variable_list&& inputs) -> variable_list { +auto Error::apply(variable_list&& grad_outputs) -> variable_list { throw std::runtime_error(msg); } diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h index 1a21a360ba9fc2..aa4b422136930f 100644 --- a/torch/csrc/autograd/functions/tensor.h +++ b/torch/csrc/autograd/functions/tensor.h @@ -13,7 +13,7 @@ namespace torch { namespace autograd { struct CopyBackwards : public Function { - variable_list apply(variable_list&& grads) override; + variable_list apply(variable_list&& inputs) override; at::Type *src_type; int32_t src_device = -1; @@ -23,12 +23,9 @@ struct CopyBackwards : public Function { // grad[idx] is defined by the relative sizes, strides, and offset of base and // view. struct CopySlices : public Function { - CopySlices( - const Variable& base_var, - at::TensorGeometry view_, - std::shared_ptr fn_); + CopySlices(const Variable& base, at::TensorGeometry view, std::shared_ptr fn); - variable_list apply(variable_list&& inputs) override; + variable_list apply(variable_list&& grads) override; void release_variables() override; at::TensorGeometry base; diff --git a/torch/csrc/autograd/input_buffer.h b/torch/csrc/autograd/input_buffer.h index f1c02e0d78e565..2e0febfc84b0bc 100644 --- a/torch/csrc/autograd/input_buffer.h +++ b/torch/csrc/autograd/input_buffer.h @@ -22,14 +22,14 @@ struct InputBuffer { InputBuffer& operator=(InputBuffer&& other) = default; // Accumulates the variable at a specified index. - void add(size_t pos, Variable var); + void add(size_t idx, Variable var); int device() const; Variable operator[](size_t pos) { return buffer[pos]; } // Returns the inputs as a list of variables. Destroys given InputBuffer. 
- static std::vector variables(InputBuffer&& g); + static std::vector variables(InputBuffer&& buffer); private: std::vector buffer; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index ba0fee1510baa2..dd77dc193ba9bd 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -185,7 +185,7 @@ struct TORCH_API RecordFunction { using thread_event_lists = std::vector>; // NOTE: changing profiler modes is **NOT THREAD SAFE**. You should ensure that // there no autograd functions are being executed when these function are used. -TORCH_API void enableProfiler(ProfilerState new_state); +TORCH_API void enableProfiler(ProfilerState state); TORCH_API thread_event_lists disableProfiler(); } // namespace profiler diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index e9d29bd0caa688..08e494530040eb 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -45,7 +45,7 @@ namespace torch { namespace autograd { VariableInfo::VariableInfo(const Variable& var) : type(&var.type()) - , size(var.sizes().vec()) + , size(var.sizes()) , requires_grad(var.requires_grad()) { if (var.type().is_cuda()) { device = var.get_device(); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 1aa21f84d45cf2..cd8329cad01434 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -154,6 +154,14 @@ static Variable applySlicing(const Variable& self, PyObject* index, variable_lis result = applySelect(result, dim, THPUtils_unpackLong(obj)); } else if (PySlice_Check(obj)) { result = applySlice(result, dim, obj); +#ifndef USE_TH_SIZE_ZERO_DIM + if (result.numel() == 0) { + // TODO: currently we don't have support for 0-sized dims, so slicing a dim + // to size 0 will return a size 0 tensor. For now, just short-circuit slicing + // and return that size 0 tensor.
+ return result; + } +#endif dim++; } else if (obj == Py_Ellipsis) { dim += self.dim() - specified_dims; diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 037f06a7f95c11..61a1d3b3eac172 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -45,10 +45,10 @@ class TORCH_API SavedVariable { std::weak_ptr grad_accumulator_; VariableVersion version_counter_; - uint32_t saved_version_ = 0; - uint32_t output_nr_ = 0; + uint32_t saved_version_; + uint32_t output_nr_; bool was_default_constructed_ = true; - bool requires_grad_ = false; - bool has_grad_fn_ = false; + bool requires_grad_; + bool has_grad_fn_; }; }} // namespace torch::autograd diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index 30aded0a85e73a..9bbae25d9c4d96 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -22,7 +22,7 @@ namespace torch { namespace autograd { Variable::Impl::Impl(at::Tensor data, bool requires_grad, Edge gradient_edge) - : TensorImpl(data.type().backend(), data.type().scalarType(), nullptr, /* is variable */ true), + : TensorImpl(VariableType::getType(data), nullptr), data_(std::move(data)), grad_fn_(std::move(gradient_edge.function)), requires_grad_(false), @@ -118,9 +118,7 @@ void Variable::Impl::backward( void Variable::Impl::set_data(Tensor new_data) { if (new_data.type() != data_.type()) { - scalar_type_ = new_data.type().scalarType(); - backend_ = new_data.type().backend(); - is_variable_ = true; + type_ = VariableType::getType(new_data.type()); // Clear grad_accumulator if it exists, since it stores the old type info. grad_accumulator_.reset(); } @@ -156,8 +154,8 @@ std::shared_ptr& Variable::ViewImpl::get_grad_fn() { AT_ASSERT(output_nr_ == 0); auto fn = std::make_shared(); fn->self_geometry = at::TensorGeometry(base_); - fn->size = sizes().vec(); - fn->stride = strides().vec(); + fn->size = sizes(); + fn->stride = strides(); fn->storage_offset = data_.storage_offset(); fn->set_next_edges(collect_next_edges(base_)); fn->add_input_metadata(base_.type(), sizes()); diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index d46008bbdd10b0..c97a0322359a4d 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -263,7 +263,7 @@ struct Variable::Impl : public at::TensorImpl { TORCH_API explicit Impl( at::Tensor data, bool requires_grad = false, - Edge gradient_edge = Edge()); + Edge edge = Edge()); ~Impl() override; @@ -327,6 +327,9 @@ struct Variable::Impl : public at::TensorImpl { /// Reset all expensive fields to free up resources void release_resources() override; + // Make this field public so we can access it from `Variable`. 
+ using at::TensorImpl::type_; + std::string name; at::Tensor data_; diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp index 8237239f99b639..0e869876e8e1fa 100644 --- a/torch/csrc/cuda/comm.cpp +++ b/torch/csrc/cuda/comm.cpp @@ -74,7 +74,7 @@ tensor_list2d broadcast_coalesced(TensorList tensors, IntList devices, size_t bu } tensor_list2d outputs(devices.size()); - outputs[0] = tensors.vec(); + outputs[0] = tensors; for (auto & o : outputs) o.reserve(tensors.size()); diff --git a/torch/csrc/distributed/c10d/ddp.h b/torch/csrc/distributed/c10d/ddp.h deleted file mode 100644 index 7b26c1475fc1c6..00000000000000 --- a/torch/csrc/distributed/c10d/ddp.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include - -#include - -#include - -#include -#include -#include - -namespace c10d { -inline void distBroadcastCoalesced( - std::vector& tensors, - int64_t bufferSize, - ProcessGroup& processGroup) { - auto tensorGroups = torch::utils::take_tensors(tensors, bufferSize); - // We store single-element vectors in `flatTensors` because - // `ProcessGroup::broadcast` takes a reference to a vector, which must be - // alive until the `wait()` call on the returned `Work` completes. - std::vector> flatTensors; - std::vector> work; - flatTensors.reserve(tensorGroups.size()); - work.reserve(tensorGroups.size()); - for (const auto& group : tensorGroups) { - // Flatten each group of tensors (whose size equals `bufferSize`) into a - // single tensor. - flatTensors.push_back({torch::utils::flatten_dense_tensors(group.tensors)}); - BroadcastOptions broadcastOptions; - broadcastOptions.rootRank = 0; - broadcastOptions.rootTensor = 0; - // Enqueue a work item and collect the `Work` (essntially a "future") so we - // can `wait()` for its completion after we have collected all `Work` items. - work.push_back( - processGroup.broadcast(flatTensors.back(), broadcastOptions)); - } - // Now loop through each group, wait for the broadcast to complete, and - // un-flatten the broadcast tensor back into device-local individual tensors. - for (size_t group = 0; group < tensorGroups.size(); ++group) { - auto& tensors = tensorGroups[group].tensors; - work[group]->wait(); - const auto synced = - torch::utils::unflatten_dense_tensors(flatTensors[group][0], tensors); - AT_ASSERT(synced.size() == tensors.size()); - for (size_t i = 0; i < synced.size(); ++i) { - // Copy into the per-process tensors. 
- tensors[i].copy_(synced[i], /*non_blocking=*/true); - } - } -} -} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 797fcbcdd2432e..2bd7a871dc36fc 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -13,10 +13,9 @@ #include #include -#include -#include -#include -#include +#include "torch/csrc/Exceptions.h" +#include "torch/csrc/utils/object_ptr.h" +#include "torch/csrc/utils/pybind.h" namespace torch { namespace distributed { @@ -200,8 +199,6 @@ PyObject* c10d_init(PyObject* _unused) { &::c10d::ProcessGroup::Work::wait, py::call_guard()); - module.def("_dist_broadcast_coalesced", &::c10d::distBroadcastCoalesced); - Py_RETURN_TRUE; } diff --git a/torch/csrc/jit/argument_spec.h b/torch/csrc/jit/argument_spec.h index f404b4ce9a05c6..d6bd90cb708784 100644 --- a/torch/csrc/jit/argument_spec.h +++ b/torch/csrc/jit/argument_spec.h @@ -59,21 +59,20 @@ struct ArgumentSpec { for(int32_t i = 0; i < num_inputs; i++) { auto & pod = pods[i]; pod.is_tensor = static_cast(inputs[i].isTensor()); - if (pod.is_tensor) { - at::Tensor t = inputs[i].toTensor(); - pod.defined = t.defined(); - if (pod.defined) { - pod.type = static_cast(t.type().scalarType()); - pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); - pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); - total_dims += t.ndimension(); - auto sizes = t.sizes(); - std::copy(sizes.begin(),sizes.end(), next_dim); - next_dim += sizes.size(); - auto strides = t.strides(); - std::copy(strides.begin(), strides.end(), next_dim); - next_dim += strides.size(); - } + if (!pod.is_tensor) continue; + at::Tensor t = inputs[i].toTensor(); + pod.defined = t.defined(); + if (pod.defined) { + pod.type = static_cast(t.type().scalarType()); + pod.device = (!t.type().is_cuda()) ? -1 : t.get_device(); + pod.requires_grad = with_grad && autograd::as_variable_ref(t).requires_grad(); + total_dims += t.ndimension(); + auto sizes = t.sizes(); + std::copy(sizes.begin(),sizes.end(), next_dim); + next_dim += sizes.size(); + auto strides = t.strides(); + std::copy(strides.begin(), strides.end(), next_dim); + next_dim += strides.size(); } // each POD has a running tally of all dimensions including its own pod.total_dims = total_dims; diff --git a/torch/csrc/jit/attributes.h b/torch/csrc/jit/attributes.h index 53b87af9ef991d..f69790cab52e00 100644 --- a/torch/csrc/jit/attributes.h +++ b/torch/csrc/jit/attributes.h @@ -28,7 +28,7 @@ struct AttributeValue { Symbol name; virtual AttributeKind kind() const = 0; virtual Ptr clone() const = 0; - virtual ~AttributeValue() = default; + virtual ~AttributeValue() {} }; template @@ -101,7 +101,7 @@ struct AttributeError : public std::exception { // we return Derived* pointers because Nodes are normally held as pointers. 
template struct Attributes { - Attributes() = default; + Attributes() {} void copyAttributes(const Attributes & rhs) { values_.clear(); for(auto & i : rhs.values_) { diff --git a/torch/csrc/jit/autodiff.cpp b/torch/csrc/jit/autodiff.cpp index 7f250bf7c452aa..c830dc45a537f5 100644 --- a/torch/csrc/jit/autodiff.cpp +++ b/torch/csrc/jit/autodiff.cpp @@ -9,7 +9,6 @@ #include #include -#include namespace torch { namespace jit { @@ -565,13 +564,14 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) { reverse_block->owningNode()->destroy(); } -Gradient differentiate(std::shared_ptr& graph, const std::vector& requires_grad) { +Gradient differentiate(std::shared_ptr& _graph, const std::vector& requires_grad) { Gradient grad_desc; // Take ownership of the graph - JIT_ASSERTM(graph.use_count() == 1, - "differentiate will mutate and destroy the graph, so it requires " - "graph.use_count() == 1, but found %d", graph.use_count()); - std::swap(graph, grad_desc.f); + JIT_ASSERTM( + _graph.use_count() == 1, + "differentiate will mutate and destroy the graph, so it requires " + "graph.use_count() == 1, but found ", _graph.use_count()); + std::swap(_graph, grad_desc.f); // XXX: Take care when handling outputs - they can be duplicated! WithInsertPoint guard(grad_desc.f->block()); diff --git a/torch/csrc/jit/autodiff.h b/torch/csrc/jit/autodiff.h index ea2b7a1170efeb..6dd2be9db0e779 100644 --- a/torch/csrc/jit/autodiff.h +++ b/torch/csrc/jit/autodiff.h @@ -4,9 +4,7 @@ #include "torch/csrc/jit/ir.h" #include - #include -#include namespace torch { namespace jit { diff --git a/torch/csrc/jit/constants.cpp b/torch/csrc/jit/constants.cpp index 47e593bbb125e2..3c4ad0c130ea31 100644 --- a/torch/csrc/jit/constants.cpp +++ b/torch/csrc/jit/constants.cpp @@ -22,13 +22,8 @@ Value* insertConstant( n->f_(attr::value, val.toDouble()); n->output()->setType(FloatType::get()); } else if(val.isIntList()) { - n->is_(attr::value, val.toIntList()->elements().vec()); + n->is_(attr::value, val.toIntList()->elements()); n->output()->setType(ListType::ofInts()); - } else if(val.isTensorList()) { - n->ts_(attr::value, fmap(val.toTensorList()->elements(), [](const at::Tensor & t) { - return autograd::Variable(t).data(); - })); - n->output()->setType(ListType::ofTensors()); } else { throw std::runtime_error("Unsupported value kind: " + val.tagKind()); } @@ -71,14 +66,6 @@ RegisterOperators reg({ push(stack, is); return 0; }; - } else if(type->isSubtypeOf(ListType::ofTensors())) { - auto ts = fmap(node->ts(attr::value), [](const at::Tensor & t) -> at::Tensor { - return autograd::make_variable(t); - }); - return [ts](Stack& stack) { - push(stack, ts); - return 0; - }; } else { std::stringstream ss; ss << "constant literal not supported for: " << type->str(); diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 20208af5496c28..71dec999c40216 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -1,7 +1,6 @@ #include "torch/csrc/jit/export.h" -#include "torch/csrc/autograd/symbolic.h" -#include "onnx/onnx.pb.h" #include "torch/csrc/onnx/onnx.h" +#include "torch/csrc/autograd/symbolic.h" #include "torch/csrc/utils/functional.h" #include @@ -19,8 +18,7 @@ namespace torch { namespace jit { namespace { -namespace onnx_torch = ::torch::onnx; -namespace onnx = ::ONNX_NAMESPACE; +namespace onnx = ::torch::onnx; std::string value_name(Value* n) { return n->uniqueName(); @@ -28,7 +26,7 @@ std::string value_name(Value* n) { struct ExportContext { size_t num_blocks = 0; - 
onnx_torch::OperatorExportTypes operator_export_type; + onnx::OperatorExportTypes operator_export_type; }; void encodeGraph(onnx::GraphProto * p_g, const std::shared_ptr & g, @@ -45,37 +43,34 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, for(auto d : tensor.sizes()) { p->add_dims(d); } - onnx::TensorProto_DataType onnx_type; + onnx::DataType onnx_type; // Most integral types and float16 need to be serialized as int32 at::ScalarType cast_type = tensor.type().scalarType(); switch(tensor.type().scalarType()) { case at::kDouble: - onnx_type = onnx::TensorProto_DataType_DOUBLE; + onnx_type = onnx::kDOUBLE; break; case at::kFloat: - onnx_type = onnx::TensorProto_DataType_FLOAT; + onnx_type = onnx::kFLOAT; break; case at::kHalf: - onnx_type = onnx::TensorProto_DataType_FLOAT16; + onnx_type = onnx::kFLOAT16; cast_type = at::kInt; break; case at::kByte: - onnx_type = onnx::TensorProto_DataType_UINT8; - cast_type = at::kInt; - break; case at::kChar: - onnx_type = onnx::TensorProto_DataType_INT8; + onnx_type = onnx::kINT8; cast_type = at::kInt; break; case at::kShort: - onnx_type = onnx::TensorProto_DataType_INT16; + onnx_type = onnx::kINT16; cast_type = at::kInt; break; case at::kInt: - onnx_type = onnx::TensorProto_DataType_INT32; + onnx_type = onnx::kINT32; break; case at::kLong: - onnx_type = onnx::TensorProto_DataType_INT64; + onnx_type = onnx::kINT64; break; default: AT_ERROR("unexpected tensor scalar type"); @@ -90,14 +85,13 @@ void encodeTensor(onnx::TensorProto * p, const at::Tensor & tensor, if (external_ref) { // For now, we use the name of the tensor as the external lookup name to // avoid ONNX protobuf changes. - JIT_ASSERT(external_ref.value() == p->name()); + JIT_ASSERT(external_ref.value() == p->get_name()); JIT_ASSERT(raw_data_export_map != nullptr); JIT_ASSERT(raw_data_export_map->count(external_ref.value()) == 0); (*raw_data_export_map)[external_ref.value()] = t; - p->set_raw_data("__EXTERNAL"); + p->set_external_data_present(); } else { - JIT_ASSERT(t.is_contiguous()); - p->set_raw_data(std::string(static_cast(t.data_ptr()), t.type().elementSizeInBytes() * t.numel())); + p->set_raw_data(t); } } @@ -108,50 +102,50 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export switch(n->kindOf(name)) { case AttributeKind::f: attr->set_f(n->f(name)); - attr->set_type(onnx::AttributeProto_AttributeType_FLOAT); + attr->set_type(onnx::aFLOAT); break; case AttributeKind::fs: - attr->set_type(onnx::AttributeProto_AttributeType_FLOATS); + attr->set_type(onnx::aFLOATS); for(auto & v : n->fs(name)) attr->add_floats(v); break; case AttributeKind::i: - attr->set_type(onnx::AttributeProto_AttributeType_INT); + attr->set_type(onnx::aINT); attr->set_i(n->i(name)); break; case AttributeKind::is: - attr->set_type(onnx::AttributeProto_AttributeType_INTS); + attr->set_type(onnx::aINTS); for(auto & v : n->is(name)) attr->add_ints(v); break; case AttributeKind::s: - attr->set_type(onnx::AttributeProto_AttributeType_STRING); + attr->set_type(onnx::aSTRING); attr->set_s(n->s(name)); break; case AttributeKind::ss: - attr->set_type(onnx::AttributeProto_AttributeType_STRINGS); + attr->set_type(onnx::aSTRINGS); for(auto & v : n->ss(name)) attr->add_strings(v); break; case AttributeKind::t: { - attr->set_type(onnx::AttributeProto_AttributeType_TENSOR); + attr->set_type(onnx::aTENSOR); auto t = attr->mutable_t(); encodeTensor(t, n->t(name)); } break; case AttributeKind::ts: - attr->set_type(onnx::AttributeProto_AttributeType_TENSORS); + 
attr->set_type(onnx::aTENSORS); for(auto & v : n->ts(name)) { auto t = attr->add_tensors(); encodeTensor(t, v); } break; case AttributeKind::g: { - attr->set_type(onnx::AttributeProto_AttributeType_GRAPH); + attr->set_type(onnx::aGRAPH); auto g = attr->mutable_g(); encodeGraph(g, n->g(name), {}, ctx, nullptr); } break; case AttributeKind::gs: - attr->set_type(onnx::AttributeProto_AttributeType_GRAPHS); + attr->set_type(onnx::aGRAPHS); for(auto & v : n->gs(name)) { auto g = attr->add_graphs(); encodeGraph(g, v, {}, ctx, nullptr); @@ -160,52 +154,49 @@ void addAttribute(onnx::NodeProto * n_p, jit::Node * n, jit::Symbol name, Export } } -void encodeTypeProtoTensorType(onnx::TypeProto_Tensor* tensor_type, Value* n) { +void encodeTypeProtoTensorType(onnx::TypeProtoTensor* tensor_type, Value* n) { onnx::TensorShapeProto* shape = tensor_type->mutable_shape(); if (TensorTypePtr node_type = n->type()->cast()) { const std::vector& sizes = node_type->sizes(); - for (size_t i = 0; i < sizes.size(); i++) { - shape->add_dim(); - shape->mutable_dim(i)->set_dim_value(sizes[i]); + for (std::int64_t s : sizes) { + shape->add_dim(s); } - onnx::TensorProto_DataType onnx_type; + onnx::DataType onnx_type; switch(node_type->scalarType()) { case at::kDouble: - onnx_type = onnx::TensorProto_DataType_DOUBLE; + onnx_type = onnx::kDOUBLE; break; case at::kFloat: - onnx_type = onnx::TensorProto_DataType_FLOAT; + onnx_type = onnx::kFLOAT; break; case at::kHalf: - onnx_type = onnx::TensorProto_DataType_FLOAT16; + onnx_type = onnx::kFLOAT16; break; case at::kByte: - onnx_type = onnx::TensorProto_DataType_UINT8; - break; case at::kChar: - onnx_type = onnx::TensorProto_DataType_INT8; + onnx_type = onnx::kINT8; break; case at::kShort: - onnx_type = onnx::TensorProto_DataType_INT16; + onnx_type = onnx::kINT16; break; case at::kInt: - onnx_type = onnx::TensorProto_DataType_INT32; + onnx_type = onnx::kINT32; break; case at::kLong: - onnx_type = onnx::TensorProto_DataType_INT64; + onnx_type = onnx::kINT64; break; default: AT_ERROR("unexpected tensor scalar type"); break; } - tensor_type->set_elem_type(onnx_type); + tensor_type->set_data_type(onnx_type); } } void encodeValueInfo(onnx::ValueInfoProto* v, Value* n) { v->set_name(value_name(n)); onnx::TypeProto* t = v->mutable_type(); - onnx::TypeProto_Tensor* tensor_type = t->mutable_tensor_type(); + onnx::TypeProtoTensor* tensor_type = t->mutable_tensor_type(); encodeTypeProtoTensorType(tensor_type, n); } @@ -235,7 +226,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, encodeValueInfo(v, output); } for (auto node : b->nodes()) { - bool is_raw_export = ctx->operator_export_type == onnx_torch::OperatorExportTypes::RAW; + bool is_raw_export = ctx->operator_export_type == onnx::OperatorExportTypes::RAW; if (node->kind() == prim::Undefined && !is_raw_export) { // Undefined nodes are used to implement optional inputs. 
One // way to "not provide" an optional input is to create an @@ -262,7 +253,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, JIT_ASSERT(!node->kind().is_onnx()); p_n->set_domain(node->kind().domainString()); } - else if (ctx->operator_export_type != onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) { + else if (ctx->operator_export_type != onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK) { JIT_ASSERT(node->kind().is_onnx()); } p_n->set_op_type(node->kind().toUnqualString()); @@ -272,7 +263,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, if (is_raw_export && node->blocks().size() > 0) { auto blocks = p_n->add_attribute(); blocks->set_name("_blocks"); - blocks->set_type(onnx::AttributeProto_AttributeType_GRAPHS); + blocks->set_type(onnx::aGRAPHS); for (auto block : node->blocks()) { auto graph = blocks->add_graphs(); encodeBlock(graph, block, initializers, ctx, raw_data_export_map); @@ -283,7 +274,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto body = p_n->add_attribute(); body->set_name("body"); - body->set_type(onnx::AttributeProto_AttributeType_GRAPH); + body->set_type(onnx::aGRAPH); auto g = body->mutable_g(); encodeBlock(g, node->blocks()[0], {}, ctx, raw_data_export_map); } @@ -292,13 +283,13 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, auto true_branch = p_n->add_attribute(); true_branch->set_name("then_branch"); - true_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); + true_branch->set_type(onnx::aGRAPH); auto true_g = true_branch->mutable_g(); encodeBlock(true_g, node->blocks()[0], {}, ctx, raw_data_export_map); auto false_branch = p_n->add_attribute(); false_branch->set_name("else_branch"); - false_branch->set_type(onnx::AttributeProto_AttributeType_GRAPH); + false_branch->set_type(onnx::aGRAPH); auto false_g = false_branch->mutable_g(); encodeBlock(false_g, node->blocks()[1], {}, ctx, raw_data_export_map); } @@ -309,7 +300,7 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, for (auto & tensor : initializers) { // TODO: stop using positions to determine which initializers // match to which inputs - std::string name = p_g->input(inputs_count++).name(); + std::string name = p_g->get_input_name(inputs_count++); auto p = p_g->add_initializer(); p->set_name(name); if (raw_data_export_map) { @@ -323,8 +314,8 @@ void encodeBlock(onnx::GraphProto * p_g, Block *b, void encodeModel(onnx::ModelProto* p_m, const std::shared_ptr& g, const std::vector& initializers, RawDataExportMap* raw_data_export_map = nullptr, - onnx_torch::OperatorExportTypes operator_export_type - = onnx_torch::OperatorExportTypes::ONNX) { + onnx::OperatorExportTypes operator_export_type + = onnx::OperatorExportTypes::ONNX) { onnx::GraphProto* p_g = p_m->mutable_graph(); ExportContext ctx; ctx.operator_export_type = operator_export_type; @@ -343,7 +334,7 @@ std::string getNodeStackTraceString(Node* n) { } } // namespace -void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExportTypes operator_export_type) { +void validateGraph(const std::shared_ptr& graph, onnx::OperatorExportTypes operator_export_type) { for (auto node : graph->nodes()) { // Macro'ed so we get a marginally better line number on failed export #define FAIL_EXPORT(name) \ @@ -365,7 +356,7 @@ void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExpo "Cannot export individual pack_padded_sequence or pad_packed_sequence; these operations must occur in pairs.\n\nUsage of this operation occurred at:\n" + getNodeStackTraceString(node)); } - bool is_aten_fallback = 
operator_export_type == onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK; + bool is_aten_fallback = operator_export_type == onnx::OperatorExportTypes::ONNX_ATEN_FALLBACK; if (!node->kind().is_onnx() && !is_aten_fallback && node->kind() != prim::Undefined) { FAIL_EXPORT( "Couldn't export operator " + node->kind().toDisplayString() + "\n\nDefined at:\n" + @@ -376,182 +367,6 @@ void validateGraph(const std::shared_ptr& graph, onnx_torch::OperatorExpo } } -// Pretty printing -namespace { -constexpr char indent_char = ' '; -constexpr size_t indent_multiplier = 2; - -std::string idt(size_t indent) { - return std::string(indent * indent_multiplier, indent_char); -} - -std::string nlidt(size_t indent) { - return std::string("\n") + idt(indent); -} - -void dump(const onnx::TensorProto& tensor, std::ostream& stream) { - stream << "TensorProto shape: ["; - for (int i = 0; i < tensor.dims_size(); ++i) { - stream << tensor.dims(i) << (i == tensor.dims_size() - 1 ? "" : " "); - } - stream << "]"; -} - -void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { - for (int i = 0; i < shape.dim_size(); ++i) { - auto &dim = shape.dim(i); - if (dim.has_dim_value()) { - stream << dim.dim_value(); - } else { - stream << "?"; - } - stream << (i == shape.dim_size() - 1 ? "" : " "); - } -} - -void dump(const onnx::TypeProto_Tensor& tensor_type, std::ostream& stream) { - stream << "Tensor dims: "; - dump(tensor_type.shape(), stream); -} - -void dump(const onnx::TypeProto& type, std::ostream& stream) { - dump(type.tensor_type(), stream); -} - -void dump(const onnx::ValueInfoProto& value_info, std::ostream& stream) { - stream << "{name: \"" << value_info.name() - << "\", type:"; - dump(value_info.type(), stream); - stream << "}"; -} - -void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent); - -void dump(const onnx::AttributeProto& attr, std::ostream& stream, size_t indent) { - stream << "{ name: '" << attr.name() << "', type: "; - if (attr.has_f()) { - stream << "float, value: " << attr.f(); - } else if (attr.has_i()) { - stream << "int, value: " << attr.i(); - } else if (attr.has_s()) { - stream << "string, value: '" << attr.s() << "'"; - } else if (attr.has_g()) { - stream << "graph, value:\n"; - dump(attr.g(), stream, indent+1); - stream << nlidt(indent); - } else if (attr.has_t()) { - stream << "tensor, value:"; - dump(attr.t(), stream); - } else if (attr.floats_size()) { - stream << "floats, values: ["; - for (int i = 0; i < attr.floats_size(); ++i) - stream << attr.floats(i) << (i == attr.floats_size() - 1 ? "" : " "); - stream << "]"; - } else if (attr.ints_size()) { - stream << "ints, values: ["; - for (int i = 0; i < attr.ints_size(); ++i) - stream << attr.ints(i) << (i == attr.ints_size() - 1 ? "" : " "); - stream << "]"; - } else if (attr.strings_size()) { - stream << "strings, values: ["; - for (int i = 0; i < attr.strings_size(); ++i) - stream << "'" << attr.strings(i) << "'" << (i == attr.strings_size() - 1 ? 
"" : " "); - stream << "]"; - } else if (attr.tensors_size()) { - stream << "tensors, values: ["; - for (auto& t : attr.tensors()) { - dump(t, stream); - } - stream << "]"; - } else if (attr.graphs_size()) { - stream << "graphs, values: ["; - for (auto& g : attr.graphs()) { - dump(g, stream, indent+1); - } - stream << "]"; - } else { - stream << "UNKNOWN"; - } - stream << "}"; -} - -void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { - stream << "Node {type: \"" << node.op_type() << "\", inputs: ["; - for (int i = 0; i < node.input_size(); ++i) { - stream << node.input(i) << (i == node.input_size() - 1 ? "" : ","); - } - stream << "], outputs: ["; - for (int i = 0; i < node.output_size(); ++i) { - stream << node.output(i) << (i == node.output_size() - 1 ? "" : ","); - } - stream << "], attributes: ["; - for (int i = 0; i < node.attribute_size(); ++i) { - dump(node.attribute(i), stream, indent+1); - stream << (i == node.attribute_size() - 1 ? "" : ","); - } - stream << "]}"; -} - -void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent) { - stream << idt(indent) << "GraphProto {" << nlidt(indent+1) - << "name: \"" << graph.name() << "\"" << nlidt(indent+1) - << "inputs: ["; - for (int i = 0; i < graph.input_size(); ++i) { - dump(graph.input(i), stream); - stream << (i == graph.input_size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "outputs: ["; - for (int i = 0; i < graph.output_size(); ++i) { - dump(graph.output(i), stream); - stream << (i == graph.output_size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "initializers: ["; - for (int i = 0; i < graph.initializer_size(); ++i) { - dump(graph.initializer(i), stream); - stream << (i == graph.initializer_size() - 1 ? "" : ","); - } - stream << "]" << nlidt(indent+1) - << "nodes: [" << nlidt(indent+2); - for (int i = 0; i < graph.node_size(); ++i) { - dump(graph.node(i), stream, indent+2); - if (i != graph.node_size() - 1) stream << "," << nlidt(indent+2); - } - stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; -} - -void dump(const onnx::OperatorSetIdProto& operator_set_id, std::ostream& stream) { - stream << "OperatorSetIdProto { domain: " << operator_set_id.domain() << "}"; -} - -void dump(const onnx::ModelProto& model, std::ostream& stream, size_t indent) { - stream << idt(indent) - << "ModelProto {" << nlidt(indent+1) - << "producer_name: \"" << model.producer_name() << "\"" << nlidt(indent+1) - << "domain: \"" << model.domain() << "\"" << nlidt(indent+1) - << "doc_string: \"" << model.doc_string() << "\""; - if (model.has_graph()) { - stream << nlidt(indent+1) << "graph:\n"; - dump(model.graph(), stream, indent+2); - } - if (model.opset_import_size()) { - stream << idt(indent+1) << "opset_import: ["; - for (auto &opset_imp : model.opset_import()) { - dump(opset_imp, stream); - } - stream << "],\n"; - } - stream << idt(indent) << "}\n"; -} -} // namespace - -std::string prettyPrint(const onnx::ModelProto& model) { - std::stringstream ss; - dump(model, ss, 0); - return ss.str(); -} - } namespace { @@ -561,15 +376,14 @@ RawDataExportMap ToModelProto( const std::vector & initializers, int64_t onnx_opset_version, bool defer_weight_export, - onnx_torch::OperatorExportTypes operator_export_type, + onnx::OperatorExportTypes operator_export_type, onnx::ModelProto *model_proto) { - if (operator_export_type != onnx_torch::OperatorExportTypes::RAW) { + if (operator_export_type != onnx::OperatorExportTypes::RAW) { validateGraph(graph, operator_export_type); } 
model_proto->set_producer_name("pytorch"); model_proto->set_producer_version("0.3"); - model_proto->set_ir_version(onnx::IR_VERSION); auto* imp = model_proto->add_opset_import(); // This is the version of ONNX operator set we are targeting imp->set_version(onnx_opset_version); @@ -597,12 +411,12 @@ std::string PrettyPrintExportedGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::ONNX_NAMESPACE::ModelProto model_proto; + ::torch::onnx::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - return prettyPrint(model_proto); + return model_proto.prettyPrint(); } // export_raw_ir will export IR ops without turning them into ONNX ops. @@ -616,12 +430,21 @@ std::tuple ExportGraph( int64_t onnx_opset_version, bool defer_weight_export, ::torch::onnx::OperatorExportTypes operator_export_type) { - ::ONNX_NAMESPACE::ModelProto model_proto; + ::torch::onnx::ModelProto model_proto; RawDataExportMap raw_data_export_map; raw_data_export_map = ToModelProto( graph, initializers, onnx_opset_version, defer_weight_export, operator_export_type, &model_proto); - return std::make_tuple(model_proto.SerializeAsString(), raw_data_export_map); + + size_t out_size; + pb_get_encoded_size(&out_size, onnx_ModelProto_fields, &model_proto.proto); + + // Allocate storage and export the graph + std::string out(out_size, '\0'); + pb_ostream_t ostream = pb_ostream_from_buffer(reinterpret_cast(&out[0]), out_size); + pb_encode(&ostream, onnx_ModelProto_fields, &model_proto.proto); + + return std::make_tuple(out, raw_data_export_map); } }} diff --git a/torch/csrc/jit/fusion_compiler.cpp b/torch/csrc/jit/fusion_compiler.cpp index 22f8b40ba30542..8d20045efefe6a 100644 --- a/torch/csrc/jit/fusion_compiler.cpp +++ b/torch/csrc/jit/fusion_compiler.cpp @@ -345,14 +345,18 @@ std::vector emitCompilationUnit(std::ostream & out, size_t i = 0; for(auto o : subgraph.outputs()) { auto & desc = agraph.output_desc[i++]; - if(o->node()->kind() != prim::FusedConcat) { + if(o->node()->kind() != aten::cat) { emitFormal(o, desc); concat_desc.emplace_back(); flat_output_nodes.push_back(o); } else { auto cat = o->node(); - concat_desc.emplace_back(desc, cat->inputs().size(), cat->i(attr::dim)); - for(auto c : cat->inputs()) { + auto tensor_inputs = cat->inputs(); + // We need to drop the dim arg + tensor_inputs = tensor_inputs.slice(0, tensor_inputs.size() - 1); + size_t nInputs = tensor_inputs.size(); + concat_desc.emplace_back(desc, nInputs, cat->get(attr::dim).value()); + for(auto c : tensor_inputs) { emitFormal(c, *concat_desc.back().subtensorDesc); flat_output_nodes.push_back(c); } @@ -382,9 +386,8 @@ std::vector emitCompilationUnit(std::ostream & out, } for(auto n : subgraph.nodes()) { - // FusedConcat nodes work by narrowing the output Tensors before the kernel runs - if (n->kind() == prim::FusedConcat) - continue; + if(n->kind() == aten::cat) + continue; // Concat nodes by narrowing the output Tensors before the kernel runs env.s("node",valueName(n->output())); env.s("rhs", encodeRHS(n)); body << format("auto ${node} = ${rhs};\n",env); diff --git a/torch/csrc/jit/fusion_compiler.h b/torch/csrc/jit/fusion_compiler.h index c2f35ee0aa2074..6c4759aefb692a 100644 --- a/torch/csrc/jit/fusion_compiler.h +++ b/torch/csrc/jit/fusion_compiler.h @@ -86,7 +86,7 @@ struct CompiledFusionFunction { 
TH_DISALLOW_COPY_AND_ASSIGN(CompiledFusionFunction); CompiledFusionFunction(const std::string & name, AnnotatedGraph & agraph); - virtual ~CompiledFusionFunction() = default; + virtual ~CompiledFusionFunction() {} // expects outputs to be pre-allocated void launch_with_tensors(at::ArrayRef inputs, at::ArrayRef outputs); diff --git a/torch/csrc/jit/graph_executor.cpp b/torch/csrc/jit/graph_executor.cpp index 56a836b312d0c7..df81c378ad137d 100644 --- a/torch/csrc/jit/graph_executor.cpp +++ b/torch/csrc/jit/graph_executor.cpp @@ -21,7 +21,6 @@ #include "torch/csrc/jit/passes/specialize_undef.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/lower_grad_of.h" -#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/symbolic_variable.h" #include "torch/csrc/jit/ivalue.h" @@ -241,7 +240,14 @@ struct GraphExecutorImpl { , symbolically_differentiable(symbolically_differentiable) , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {} GraphExecutorImpl(std::shared_ptr graph, bool optimize) - : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) {} + : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) { + for(auto input : graph->inputs()) { + JIT_ASSERTM(input->type()->kind() != TypeKind::TupleType, "tuples cannot be inputs to the graph"); + } + for(auto output : graph->outputs()) { + JIT_ASSERTM(output->type()->kind() != TypeKind::TupleType, "tuples cannot be outputs to the graph"); + } + } // entry point where execution begins void run(Stack & stack) { @@ -510,28 +516,28 @@ void runRequiredPasses(const std::shared_ptr& g) { RemoveExpands(g); } -void specializeToSpec(const std::shared_ptr& graph, const ArgumentSpec& spec) { +void specializeToSpec(const std::shared_ptr& graph_, const ArgumentSpec& spec) { // clean up GradOf and AutogradAdd nodes // this must be first because later passes do not know what GradOfs are std::vector defined; for(size_t i = 0; i < spec.size(); ++i) { defined.push_back(spec.at(i).defined()); } - specializeUndef(*graph, defined); + specializeUndef(*graph_, defined); // required passes shared with autograd fallback - runRequiredPasses(graph); + runRequiredPasses(graph_); // Decompose addmm nodes to add + mm, so expands can be inserted and // gradients accumulated on the backward pass // // In the future, if we need more passes like this, we should convert this // into a generic canonicalization pass. - DecomposeAddmm(graph); + DecomposeAddmm(graph_); // clean up dead constants from specialization - EliminateDeadCode(graph); + EliminateDeadCode(graph_); // calculate all input shapes - PropagateInputShapes(*graph, spec); + PropagateInputShapes(*graph_, spec); } void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariables) { @@ -548,7 +554,7 @@ void runOptimization(std::shared_ptr & graph, bool graphMustSupportVariab // They also may assume that concrete sizes/strides are availiable UnrollLoops(graph); - ConstantPropagation(graph); + //TODO: create peephole optimizations that are safe to run // when we are using variables, and when we do not know sizes. 
PeepholeOptimize(graph); diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index 2693af50af1025..4e862c9e0a1e44 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -34,7 +34,7 @@ struct GraphExecutorState { struct GraphExecutorImpl; struct TORCH_API GraphExecutor { - GraphExecutor() = default; + GraphExecutor() {} GraphExecutor(std::shared_ptr graph, bool optimize = true); // note: if not specified, symbolically_differentiable is computed from the graph. GraphExecutor(std::shared_ptr graph, bool optimize, bool symbolically_differentiable); diff --git a/torch/csrc/jit/graph_node_list.h b/torch/csrc/jit/graph_node_list.h index 054b9517776863..996a8b2c75fa0f 100644 --- a/torch/csrc/jit/graph_node_list.h +++ b/torch/csrc/jit/graph_node_list.h @@ -1,5 +1,3 @@ -#pragma once - #include "torch/csrc/jit/assertions.h" namespace torch { namespace jit { diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp index a453925cf2f8eb..5b128fd822dafd 100644 --- a/torch/csrc/jit/import.cpp +++ b/torch/csrc/jit/import.cpp @@ -1,5 +1,5 @@ #include "torch/csrc/jit/import.h" -#include "onnx/onnx.pb.h" +#include "torch/csrc/onnx/onnx.npb.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/utils/functional.h" #include "torch/csrc/jit/assertions.h" @@ -16,60 +16,401 @@ namespace torch { namespace jit { namespace { -// IR graph construction +// Deserialized data + +struct Tensor_ { + std::vector dims; + std::vector raw_data; + onnx_TensorProto_DataType data_type; +}; + +struct AttributeValue_ { + std::string name; + onnx_AttributeProto_AttributeType type; + double f; + int64_t i; + std::string s; + Tensor_ t; + std::string g; + std::vector fs; + std::vector is; + std::vector ss; + std::vector ts; + std::vector gs; +}; + +struct Value_ { + std::string name; +}; + +struct Node_ { + std::string op_type; + std::string domain; + std::vector inputs; + std::vector outputs; + std::vector attrs; +}; + +struct Graph_ { + std::vector inputs; + std::vector outputs; + std::vector nodes; + std::vector initializers; +}; + +struct Model_ { + Graph_ graph; +}; + + +// Readers + +struct ReaderBase { + ReaderBase() {} + ReaderBase(pb_callback_t& cb) { + initialize_callback(cb); + } + + void initialize_callback(pb_callback_t& cb) { + cb.funcs.decode = ReaderBase::decode; + cb.arg = this; + } + + virtual void decode(pb_istream_t *stream) = 0; + + static bool decode(pb_istream_t *stream, const pb_field_t *, void **_self) { + ReaderBase* self = *reinterpret_cast(_self); + self->decode(stream); + return true; + } +}; + + +template +struct Reader : ReaderBase {}; + +template +struct Reader> : Reader { + Reader(pb_callback_t& cb) : Reader(cb) {} + // Decode is going to be called repeatedly from the callback + // (registered in the parent class constructor) each time an + // element is encountered. So all we do is relay the decoding + // through the parent class decode and push the result, every + // time this decode is called. + virtual void decode(pb_istream_t *stream) override { + Reader::decode(stream); + values.push_back(std::move(Reader::value)); + } + std::vector values; +}; + +template<> +struct Reader : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + // For string and bytes, the length value has already been + // parsed, and is available at stream->bytes_left. 
+ std::vector res(stream->bytes_left); + if (!pb_read(stream, res.data(), stream->bytes_left)) { + throw std::runtime_error("Decoding failed"); + } + value.assign(res.begin(), res.end()); + } + std::string value; +}; + +template<> +struct Reader : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode_fixed32(stream, &value)) { + throw std::runtime_error("Decoding failed"); + } + } + double value; +}; + +template<> +struct Reader : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode_varint(stream, reinterpret_cast(&value))) { + throw std::runtime_error("Decoding failed"); + } + } + int64_t value; +}; + +template<> +struct Reader> : ReaderBase { + Reader(pb_callback_t& cb) : ReaderBase(cb) {} + virtual void decode(pb_istream_t *stream) override { + // For string and bytes, the length value has already been + // parsed, and is available at stream->bytes_left. + value.resize(stream->bytes_left); + if (!pb_read(stream, value.data(), stream->bytes_left)) { + throw std::runtime_error("Decoding failed"); + } + } + std::vector value; +}; + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_TensorProto_init_default) + , dims_reader(proto.dims) + , raw_data_reader(proto.raw_data) + {} + + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_TensorProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } -namespace onnx = ::ONNX_NAMESPACE; + value.dims = std::move(dims_reader.values); + value.raw_data = std::move(raw_data_reader.value); + value.data_type = proto.data_type; + } -at::Tensor buildTensor(const onnx::TensorProto& tensor_proto) { + onnx_TensorProto proto; + Reader> dims_reader; + Reader> raw_data_reader; + Tensor_ value; +}; + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_AttributeProto_init_default) + , name_reader(proto.name) + , str_reader(proto.s) + , tensor_reader(proto.t) + , graph_reader(proto.g) + , floats_reader(proto.floats) + , ints_reader(proto.ints) + , strings_reader(proto.strings) + , tensors_reader(proto.tensors) + , graphs_reader(proto.graphs) {} + + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_AttributeProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.name = std::move(name_reader.value); + value.type = proto.type; + value.f = proto.f; + value.i = proto.i; + value.s = std::move(str_reader.value); + value.t = std::move(tensor_reader.value); + value.g = std::move(graph_reader.value); + value.fs = std::move(floats_reader.values); + value.is = std::move(ints_reader.values); + value.ss = std::move(strings_reader.values); + value.ts = std::move(tensors_reader.values); + value.gs = std::move(graphs_reader.values); + } + + onnx_AttributeProto proto; + Reader name_reader; + Reader str_reader; + Reader tensor_reader; + Reader graph_reader; + Reader> floats_reader; + Reader> ints_reader; + Reader> strings_reader; + Reader> tensors_reader; + Reader> graphs_reader; + AttributeValue_ value; +}; + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_ValueInfoProto_init_default) + , name_reader(proto.name) {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t 
*stream) override { + if (!pb_decode(stream, onnx_ValueInfoProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.name = std::move(name_reader.value); + } + + onnx_ValueInfoProto proto; + Reader name_reader; + Value_ value; +}; + + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_NodeProto_init_default) + , op_type_reader(proto.op_type) + , domain_reader(proto.domain) + , inputs_reader(proto.input) + , outputs_reader(proto.output) + , attrs_reader(proto.attribute) + {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_NodeProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.op_type = std::move(op_type_reader.value); + value.domain = std::move(domain_reader.value); + value.inputs = std::move(inputs_reader.values); + value.outputs = std::move(outputs_reader.values); + value.attrs = std::move(attrs_reader.values); + } + + onnx_NodeProto proto; + Reader op_type_reader; + Reader domain_reader; + Reader> inputs_reader; + Reader> outputs_reader; + Reader> attrs_reader; + Node_ value; +}; + + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_GraphProto_init_default) + , input_reader(proto.input) + , output_reader(proto.output) + , node_reader(proto.node) + , initializer_reader(proto.initializer) + {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_GraphProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.inputs = std::move(input_reader.values); + value.outputs = std::move(output_reader.values); + value.nodes = std::move(node_reader.values); + value.initializers = std::move(initializer_reader.values); + } + + static Graph_ read(pb_istream_t *stream) { + Reader reader; + reader.decode(stream); + return reader.value; + } + + onnx_GraphProto proto; + Reader> input_reader; + Reader> output_reader; + Reader> node_reader; + Reader> initializer_reader; + Graph_ value; +}; + + +template<> +struct Reader : ReaderBase { + Reader() + : proto(onnx_ModelProto_init_default) + , graph_reader(proto.graph) {} + Reader(pb_callback_t& cb) + : Reader() { initialize_callback(cb); } + + virtual void decode(pb_istream_t *stream) override { + if (!pb_decode(stream, onnx_ModelProto_fields, &proto)) { + throw std::runtime_error("Decoding failed"); + } + + value.graph = std::move(graph_reader.value); + } + + static Model_ read(pb_istream_t *stream) { + Reader reader; + reader.decode(stream); + return reader.value; + } + + onnx_ModelProto proto; + Reader graph_reader; + Model_ value; +}; + + +// IR graph construction + +at::Tensor buildTensor(const Tensor_& tensor_) { at::Tensor tensor; - switch(tensor_proto.data_type()) { - case onnx::TensorProto_DataType_UINT8: + switch(tensor_.data_type) { + case onnx_TensorProto_DataType_UINT8: tensor = at::CPU(at::kByte).tensor(); break; - case onnx::TensorProto_DataType_INT8: + case onnx_TensorProto_DataType_INT8: tensor = at::CPU(at::kChar).tensor(); break; - case onnx::TensorProto_DataType_INT16: + case onnx_TensorProto_DataType_INT16: tensor = at::CPU(at::kShort).tensor(); break; - case onnx::TensorProto_DataType_INT32: + case onnx_TensorProto_DataType_INT32: tensor = at::CPU(at::kInt).tensor(); break; - case onnx::TensorProto_DataType_INT64: + case onnx_TensorProto_DataType_INT64: tensor = at::CPU(at::kLong).tensor(); break; - case 
onnx::TensorProto_DataType_FLOAT16: + case onnx_TensorProto_DataType_FLOAT16: tensor = at::CPU(at::kHalf).tensor(); break; - case onnx::TensorProto_DataType_FLOAT: + case onnx_TensorProto_DataType_FLOAT: tensor = at::CPU(at::kFloat).tensor(); break; - case onnx::TensorProto_DataType_DOUBLE: + case onnx_TensorProto_DataType_DOUBLE: tensor = at::CPU(at::kDouble).tensor(); break; default: throw std::runtime_error("Unsupported data type"); } - std::vector sizes = {tensor_proto.dims().begin(), tensor_proto.dims().end()}; - tensor.resize_(sizes); + tensor.resize_(tensor_.dims); JIT_ASSERT( tensor.storage()->pImpl()->get_size() * tensor.storage()->pImpl()->elementSize() == - tensor_proto.raw_data().size()); + tensor_.raw_data.size()); - std::memcpy(tensor.data_ptr(), tensor_proto.raw_data().data(), tensor_proto.raw_data().size()); + std::memcpy(tensor.data_ptr(), tensor_.raw_data.data(), tensor_.raw_data.size()); return tensor; } -void buildBlock(const onnx::GraphProto& graph_proto, Block* block, +Graph_ readSubgraph(const std::string& serialized_subgraph) { + pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_subgraph.data()), serialized_subgraph.size()); + + return Reader::read(&istream); +} + +void buildBlock(const Graph_& graph_, Block* block, std::unordered_map& value_map); -void buildBlocks(const std::vector& graphs_, Node* node, +void buildBlocks(const std::vector& graphs_, Node* node, std::unordered_map& value_map) { for (auto g_ : graphs_) { auto block = node->addBlock(); @@ -77,96 +418,97 @@ void buildBlocks(const std::vector& graphs_, Node* node, } } -std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto) { +std::shared_ptr buildGraph(const Graph_& graph_) { auto graph = std::make_shared(); std::unordered_map value_map; - buildBlock(graph_proto, graph->block(), value_map); + buildBlock(graph_, graph->block(), value_map); return graph; } -void buildBlock(const onnx::GraphProto& graph_proto, Block* block, +void buildBlock(const Graph_& graph_, Block* block, std::unordered_map& value_map) { - for (auto & input : graph_proto.input()) { - value_map[input.name()] = block->addInput(); + for (auto & input : graph_.inputs) { + value_map[input.name] = block->addInput(); } - for (auto & node_ : graph_proto.node()) { - JIT_ASSERT(node_.op_type() != "PythonOp"); + for (auto & node_ : graph_.nodes) { + JIT_ASSERT(node_.op_type != "PythonOp"); - auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain(), node_.op_type()), - node_.output().size()); + auto node = block->owningGraph()->create(Symbol::fromDomainAndUnqualString(node_.domain, node_.op_type), + node_.outputs.size()); - for (auto & attr : node_.attribute()) { - Symbol name = Symbol::attr(attr.name()); + for (auto & attr : node_.attrs) { + Symbol name = Symbol::attr(attr.name); - switch(attr.type()) { - case onnx::AttributeProto_AttributeType_UNDEFINED: + switch(attr.type) { + case onnx_AttributeProto_AttributeType_UNDEFINED: throw std::runtime_error("UNDEFINED attribute unsupported"); break; - case onnx::AttributeProto_AttributeType_FLOAT: - node->f_(name, attr.f()); + case onnx_AttributeProto_AttributeType_FLOAT: + node->f_(name, attr.f); break; - case onnx::AttributeProto_AttributeType_INT: - node->i_(name, attr.i()); + case onnx_AttributeProto_AttributeType_INT: + node->i_(name, attr.i); break; - case onnx::AttributeProto_AttributeType_STRING: - node->s_(name, std::move(attr.s())); + case onnx_AttributeProto_AttributeType_STRING: + node->s_(name, std::move(attr.s)); break; - 
case onnx::AttributeProto_AttributeType_TENSOR: - node->t_(name, buildTensor(attr.t())); + case onnx_AttributeProto_AttributeType_TENSOR: + node->t_(name, buildTensor(attr.t)); break; - case onnx::AttributeProto_AttributeType_GRAPH: - node->g_(name, buildGraph(attr.g())); + case onnx_AttributeProto_AttributeType_GRAPH: + node->g_(name, buildGraph(readSubgraph(attr.g))); break; - case onnx::AttributeProto_AttributeType_FLOATS: - node->fs_(name, {attr.floats().begin(), attr.floats().end()}); + case onnx_AttributeProto_AttributeType_FLOATS: + node->fs_(name, std::move(attr.fs)); break; - case onnx::AttributeProto_AttributeType_INTS: - node->is_(name, {attr.ints().begin(), attr.ints().end()}); + case onnx_AttributeProto_AttributeType_INTS: + node->is_(name, std::move(attr.is)); break; - case onnx::AttributeProto_AttributeType_STRINGS: - node->ss_(name, {attr.strings().begin(), attr.strings().end()}); + case onnx_AttributeProto_AttributeType_STRINGS: + node->ss_(name, std::move(attr.ss)); break; - case onnx::AttributeProto_AttributeType_TENSORS: - node->ts_(name, fmap(attr.tensors(), [](const onnx::TensorProto& t) { return buildTensor(t); })); + case onnx_AttributeProto_AttributeType_TENSORS: + node->ts_(name, fmap(attr.ts, [](const Tensor_& t) { return buildTensor(t); })); break; - case onnx::AttributeProto_AttributeType_GRAPHS: - if (attr.name() == "_blocks") { - buildBlocks({attr.graphs().begin(), attr.graphs().end()}, node, value_map); + case onnx_AttributeProto_AttributeType_GRAPHS: + if (attr.name == "_blocks") { + buildBlocks(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); }), node, value_map); } else { - node->gs_(name, fmap(attr.graphs(), [](const onnx::GraphProto& g_) { return buildGraph(g_); })); + node->gs_(name, fmap(fmap(attr.gs, [](const std::string& g) { return readSubgraph(g); } ), + [](const Graph_& g_) { return buildGraph(g_); })); } break; } } - for (auto & input : node_.input()) { + for (auto & input : node_.inputs) { auto v = value_map[input]; node->addInput(v); } - for (int i=0; ioutputs()[i]; + for (size_t i=0; ioutputs()[i]; } block->appendNode(node); } - for (auto & output : graph_proto.output()) { - Value* v = value_map.at(output.name()); + for (auto & output : graph_.outputs) { + Value* v = value_map.at(output.name); block->registerOutput(v); } } -std::shared_ptr buildGraph(const onnx::GraphProto& graph_proto, std::vector& initializers) { +std::shared_ptr buildGraph(const Graph_& graph_, std::vector& initializers) { - auto graph = buildGraph(graph_proto); + auto graph = buildGraph(graph_); - for (auto tensor_ : graph_proto.initializer()) { + for (auto tensor_ : graph_.initializers) { initializers.push_back(buildTensor(tensor_)); } @@ -215,10 +557,12 @@ void reconstructOutputTypes(Block *b) { std::shared_ptr ImportIRGraph(const std::string& serialized_graph, std::vector& initializers) { - auto model_proto = ::ONNX_NAMESPACE::ModelProto(); - model_proto.ParseFromString(serialized_graph); - auto graph = buildGraph(model_proto.graph(), initializers); + pb_istream_t istream = pb_istream_from_buffer(reinterpret_cast(serialized_graph.data()), serialized_graph.size()); + + auto model = Reader::read(&istream); + + auto graph = buildGraph(model.graph, initializers); reconstructOutputTypes(graph->block()); diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 5363eda02ff528..d3a9bd9139a96e 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -18,7 +18,6 @@ #include "torch/csrc/jit/passes/onnx/fixup_onnx_loop.h" #include 
"torch/csrc/jit/passes/shape_analysis.h" #include "torch/csrc/jit/passes/decompose_addmm.h" -#include "torch/csrc/jit/passes/constant_propagation.h" #include "torch/csrc/jit/passes/loop_unrolling.h" #include "torch/csrc/jit/passes/to_batch.h" #include "torch/csrc/jit/passes/specialize_undef.h" @@ -71,14 +70,11 @@ void initJITBindings(PyObject *module) { }) .def("_jit_pass_lint", LintGraph) .def("_jit_pass_shape_analysis", [](Graph& graph, py::tuple inputs, bool with_grad) { - PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs, graph.inputs()))); + PropagateInputShapes(graph, ArgumentSpec(with_grad, createStack(inputs))); }) .def("_jit_pass_remove_expands", RemoveExpands) .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_loop_unrolling", UnrollLoops) - .def("_jit_pass_constant_propagation", [](std::shared_ptr& g) { - return ConstantPropagation(g); - }) .def("_jit_run_cpp_tests", [] { // We have to release the GIL inside this method, because if we happen to // initialize the autograd engine in these tests, the newly spawned worker threads will @@ -186,16 +182,15 @@ void initJITBindings(PyObject *module) { return ge.graph(); }) .def("graph_for", [](GraphExecutor& ge, py::args args) { - return ge.graphFor(createStack(args, ge.graph()->inputs())); + return ge.graphFor(createStack(args)); }) .def("get_debug_state", [](GraphExecutor& ge) { return ge.getDebugState(); }) .def("__call__", [](GraphExecutor& ge, py::args args) -> py::object { - const auto & graph = ge.graph(); - auto stack = createStack(args, graph->inputs()); + auto stack = createStack(args); ge.run(stack); - return wrapStack(std::move(stack), graph->outputs()); + return wrapStack(std::move(stack)); }); diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index c567793552d73a..52b8cb0eaccd98 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -50,7 +50,6 @@ _(prim, TensorToNum) \ _(prim, AutogradAdd) \ _(prim, GradOf) \ _(prim, AnyDefined) \ -_(prim, FusedConcat) \ _(aten, __not__) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 0c1fe17ade0dfd..65bdcf695f6de2 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -337,9 +337,9 @@ struct PreprocessGraph { struct ContainerTensor : public at::TensorImpl { public: ContainerTensor() - : TensorImpl(at::Backend::Undefined,at::ScalarType::Undefined, nullptr, /* is_variable */ false) {} + : TensorImpl(&(at::globalContext().getType(at::Backend::Undefined,at::ScalarType::Undefined)), nullptr) {} - virtual ~ContainerTensor() = default; + virtual ~ContainerTensor() {} virtual at::IntList sizes() const override { throw std::runtime_error("sizes() on ContainerTensor"); } @@ -685,8 +685,8 @@ struct CodeImpl { // InterpreterState state that is held across stages and used to compute a Code struct InterpreterStateImpl { - InterpreterStateImpl(const Code & code) - : function(code.pImpl), + InterpreterStateImpl(const Code & function_) + : function(function_.pImpl), int_data(function->int_data.data()), bool_data(function->bool_data), registers(function->register_size) { @@ -775,15 +775,15 @@ std::ostream & operator<<(std::ostream & out, const Code & code) { Code::Code(std::shared_ptr& graph) : pImpl(new CodeImpl(graph)) {} -Code::~Code() = default; +Code::~Code() {} const std::vector& Code::executors() { return pImpl->executors(); } -InterpreterState::InterpreterState(const Code & code) - 
: pImpl(new InterpreterStateImpl(code)) {} -InterpreterState::~InterpreterState() = default; +InterpreterState::InterpreterState(const Code & function) + : pImpl(new InterpreterStateImpl(function)) {} +InterpreterState::~InterpreterState() {} void InterpreterState::runOneStage(Stack & stack) { return pImpl->runOneStage(stack); diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index ede14249c46dce..7f09b22b324d11 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -44,9 +44,9 @@ std::ostream& operator<<(std::ostream & out, const at::ArrayRef & nodes) { } struct const_value_list_with_types { - const ArrayRef values; + const std::vector& values; bool use_newlines; - const_value_list_with_types(ArrayRef values, bool use_newlines = false) + const_value_list_with_types(const std::vector& values, bool use_newlines = false) : values(values), use_newlines(use_newlines) {} }; std::ostream& operator<<(std::ostream & out, const_value_list_with_types l) { @@ -355,7 +355,7 @@ void Graph::lint() const { // - every use will occur later in the topsort struct LintScope { - LintScope() = default; + LintScope() {} LintScope(std::unique_ptr parent) : parent(std::move(parent)) {} bool contains(const Value * v) { @@ -487,13 +487,13 @@ void LintGraph(std::shared_ptr& graph) { graph->lint(); } -void Block::cloneFrom(Block * src, std::function value_map) { +void Block::cloneFrom(Block * src, std::function outer_map) { std::unordered_map local_map; auto env = [&](Value * v) { auto it = local_map.find(v); if(it != local_map.end()) return it->second; - return value_map(v); + return outer_map(v); }; auto graph = owningGraph(); @@ -619,8 +619,23 @@ Value* Node::namedInput(Symbol name) const { // so this is completely unsafe and needs to be gone as soon as possible. 
return v; } + const auto & the_schema = schema(); + int64_t tensor_list_pos = 0; + for (auto & arg : the_schema.arguments) { + if (*arg.type == *ListType::ofTensors()) + break; + tensor_list_pos++; + } int64_t arg_pos = findArgument(schema(), name).first; - return input(arg_pos); + // XXX: we don't have a single value we could give for a Tensor[], + // because we flatten lists into arguments + JIT_ASSERT(arg_pos != tensor_list_pos); + // NB: if there's no tensor list, then tensor_list_pos == arguments.size(), so this is always true + if (arg_pos < tensor_list_pos) { + return input(arg_pos); + } else { + return input(inputs().size() - (the_schema.arguments.size() - arg_pos)); + } } bool Node::matches(const char *signature_literal, at::ArrayRef const_inputs) { @@ -631,12 +646,8 @@ bool Node::matches(const char *signature_literal, at::ArrayRef const_inp return true; } -void Node::dump() const { - std::cout << *this << "\n"; -} - void Node::findSchema() const { - schema_ = &getOperatorFor(this).schema(); + schema_ = &getOperatorFor(this).schema; } PythonOp* defaultAllocPythonOp(Graph*g) { diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h index b2caa642b6fe20..9af468e6ee06e7 100644 --- a/torch/csrc/jit/ir.h +++ b/torch/csrc/jit/ir.h @@ -54,7 +54,7 @@ struct Value; TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g); TORCH_API std::ostream& operator<<(std::ostream & out, const Type & t); -TORCH_API std::ostream& operator<<(std::ostream & out, const Node & n); +TORCH_API std::ostream& operator<<(std::ostream & out, const Node & t); // A list of nodes, with inputs and outputs struct Block; @@ -683,9 +683,7 @@ struct Node : public Attributes { return *schema_; } - void dump() const; - - virtual ~Node() = default; + virtual ~Node() {} private: std::pair findInput(Symbol name); void findSchema() const; @@ -891,7 +889,8 @@ friend struct Block; , block_(new Block(this, nullptr)) , insert_before_(return_node()) {} - Graph() : Graph(std::make_shared()) {} + Graph() + : Graph( std::make_shared()) {} at::ArrayRef inputs() { return block_->inputs(); diff --git a/torch/csrc/jit/ivalue.h b/torch/csrc/jit/ivalue.h index 6eef40a0323068..42a5be89e55e4b 100644 --- a/torch/csrc/jit/ivalue.h +++ b/torch/csrc/jit/ivalue.h @@ -83,7 +83,6 @@ struct ConstantList; struct IValue; using Tuple = ConstantList; using IntList = ConstantList; -using TensorList = ConstantList; using DoubleList = ConstantList; // IValue is the generic tagged union used by the interpreter to hold @@ -94,7 +93,7 @@ using DoubleList = ConstantList; // retain/release calls. 
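// Illustrative sketch only (not part of this patch): the X-macro pattern used by
// TORCH_FORALL_TAGS below lets a single list of tags generate the enum members
// and the per-tag helpers in lockstep. DemoTag / FORALL_DEMO_TAGS / demoTagName
// are made-up names for illustration.
#define FORALL_DEMO_TAGS(_) _(None) _(Int) _(Double)
enum class DemoTag {
#define DEFINE_DEMO_TAG(x) x,
  FORALL_DEMO_TAGS(DEFINE_DEMO_TAG)
#undef DEFINE_DEMO_TAG
};
inline const char* demoTagName(DemoTag t) {
  switch (t) {
#define DEFINE_DEMO_CASE(x) case DemoTag::x: return #x;
    FORALL_DEMO_TAGS(DEFINE_DEMO_CASE)
#undef DEFINE_DEMO_CASE
  }
  return "unknown";
}
#undef FORALL_DEMO_TAGS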
#define TORCH_FORALL_TAGS(_) \ - _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) _(TensorList) + _(None) _(Tensor) _(Double) _(Int) _(Tuple) _(IntList) _(DoubleList) struct IValue { IValue() @@ -224,20 +223,6 @@ struct IValue { return toRetainable(); } - //TensorList - IValue(Shared v); - IValue(std::vector v); - bool isTensorList() const { return Tag::TensorList == tag; } - Shared toTensorList() && { - JIT_ASSERT(isTensorList()); - return moveToRetainable(); - } - Shared toTensorList() const & { - JIT_ASSERT(isTensorList()); - return toRetainable(); - } - - // None bool isNone() { return Tag::None == tag; } @@ -384,15 +369,8 @@ inline IValue::IValue(Shared v) inline IValue::IValue(std::vector v) : IValue(DoubleList::create(std::move(v))) {} -inline IValue::IValue(Shared v) -: tag(Tag::TensorList), retainable(true) { - as_retainable = v.detach(); -} -inline IValue::IValue(std::vector v) -: IValue(TensorList::create(std::move(v))) {} - inline std::vector IValue::copyToIntList() const { - return toIntList()->elements().vec(); + return std::vector(toIntList()->elements()); } }} diff --git a/torch/csrc/jit/operator.cpp b/torch/csrc/jit/operator.cpp index 5cb2c2c11ad5a7..f19d18caa9289e 100644 --- a/torch/csrc/jit/operator.cpp +++ b/torch/csrc/jit/operator.cpp @@ -248,12 +248,8 @@ std::string canonicalSchemaString(const FunctionSchema& schema) { using OperatorMap = std::unordered_map>>; struct OperatorRegistry { -private: - std::mutex lock; OperatorMap operators; - // list of operators whose schema have not yet been parsed, and must - // be registered before any call to lookup an opeator - std::vector> to_register; + std::mutex lock; // Those two maps are used to implement lookupByLiteral, which is needed for the n->match(...) calls. // Basically, every function schema is assigned a unique string you can use to match it. However, // parsing those strings or comparing and hashing them character by character would be very slow, so @@ -264,26 +260,18 @@ struct OperatorRegistry { // by performing a lookup in the operators_by_sig map. 
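// Illustrative sketch only (hypothetical names, not the registry code itself) of
// the literal-pointer memoization described above: string literals have stable
// addresses, so after one slow content-based lookup the result is cached under
// the const char* pointer, and later calls with the same literal are a single
// hash-map probe. slowLookupBySignature stands in for the operators_by_sig lookup.
#include <string>
#include <unordered_map>

inline int slowLookupBySignature(const std::string& sig) {
  return static_cast<int>(sig.size());  // demo stand-in for the real signature lookup
}

inline int lookupByLiteralDemo(const char* sig_literal) {
  static std::unordered_map<const char*, int> memo;
  auto it = memo.find(sig_literal);
  if (it == memo.end()) {
    // First time we see this literal: do the slow lookup once, then cache by pointer.
    it = memo.emplace(sig_literal, slowLookupBySignature(sig_literal)).first;
  }
  return it->second;
}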
std::unordered_map> operators_by_sig; std::unordered_map> operators_by_sig_literal; + void registerOperator(Operator&& op){ + std::lock_guard guard(lock); - // XXX - caller must be holding lock - void registerPendingOperators() { - for(auto op : to_register) { - Symbol sym = Symbol::fromQualString(op->schema().name); - operators[sym].push_back(op); - operators_by_sig[canonicalSchemaString(op->schema())] = op; - } - to_register.clear(); - } + Symbol sym = Symbol::fromQualString(op.schema.name); + auto op_ptr = std::make_shared(std::move(op)); -public: - void registerOperator(Operator&& op) { - std::lock_guard guard(lock); - to_register.push_back(std::make_shared(std::move(op))); + operators[sym].push_back(op_ptr); + + operators_by_sig[canonicalSchemaString(op.schema)] = op_ptr; } const std::shared_ptr& lookupByLiteral(const char * name) { - std::lock_guard guard(lock); - registerPendingOperators(); auto it = operators_by_sig_literal.find(name); if (it == operators_by_sig_literal.end()) { auto op_ptr_it = operators_by_sig.find(name); @@ -301,10 +289,8 @@ struct OperatorRegistry { return it->second; } - const std::vector>& getOperators(Symbol name) { std::lock_guard guard(lock); - registerPendingOperators(); static std::vector> empty; auto it = operators.find(name); if(it != operators.end()) @@ -356,16 +342,16 @@ bool typeMatches(TypePtr actual, TypePtr formal) { } bool Operator::matches(const Node* node) const { - if (node->kind().toQualString() != schema().name) { + if (node->kind().toQualString() != schema.name) { return false; } size_t attributes_size = node->numAttributes(); size_t attributes_seen = 0; auto inputs_size = node->inputs().size(); size_t input_i = 0; - for(size_t arg_i = 0; arg_i < schema().arguments.size(); ++arg_i) { + for(size_t arg_i = 0; arg_i < schema.arguments.size(); ++arg_i) { at::optional attribute_kind; - const Argument& arg = schema().arguments[arg_i]; + const Argument& arg = schema.arguments[arg_i]; if(attributes_size > 0 && (attribute_kind = attributeKindOf(arg.type))) { auto name = Symbol::fromQualString("attr::" + arg.name); if(!node->hasAttribute(name) || node->kindOf(name) != *attribute_kind) { @@ -373,6 +359,22 @@ bool Operator::matches(const Node* node) const { return false; } attributes_seen++; + } else if(*arg.type == *ListType::ofTensors()) { + // Tensor[] is handled as varargs, consume inputs until the remaining required arguments + // XXX - there can only be a single Tensor[] in a declaration + size_t remaining_required = 0; + for(size_t j = arg_i + 1; j < schema.arguments.size(); ++j){ + // remaining arguments are only those that won't be consumed from attributes + if(attributes_size == 0 || !attributeKindOf(schema.arguments[j].type)) + remaining_required++; + } + while(inputs_size - input_i > remaining_required) { + auto input = node->inputs()[input_i++]; + if(!typeMatches(input->type(), DynamicType::get())) { + // std::cout << "vararg argument is not Dynamic\n"; + return false; + } + } } else { if(input_i == inputs_size) { // std::cout << "not enough inputs\n"; @@ -386,11 +388,11 @@ bool Operator::matches(const Node* node) const { } } - if(!schema().is_vararg && input_i != inputs_size) { + if(!schema.is_vararg && input_i != inputs_size) { // std::cout << "not all inputs used\n" << input_i << " " << inputs_size << "\n"; return false; } - if(!schema().is_vararg && attributes_seen != attributes_size) { + if(!schema.is_vararg && attributes_seen != attributes_size) { // std::cout << "not all attributes used\n" << attributes_seen << " " << 
attributes_size << "\n"; return false; } @@ -424,7 +426,7 @@ const Operator& getOperatorFor(const Node* node) { er << "\ncandidates were:\n"; const auto& candidates = getAllOperatorsFor(node->kind()); for(auto & candidate : candidates) { - er << " " << candidate->schema() << "\n"; + er << " " << candidate->schema << "\n"; } throw er; } @@ -434,7 +436,7 @@ OperatorSet::OperatorSet(std::initializer_list sig_literals) { auto & registry = getRegistry(); for (const char * sig : sig_literals) { auto op = registry.lookupByLiteral(sig); - ops[Symbol::fromQualString(op->schema().name)].push_back(op); + ops[Symbol::fromQualString(op->schema.name)].push_back(op); } } diff --git a/torch/csrc/jit/operator.h b/torch/csrc/jit/operator.h index be2c20b01a5379..7e6a314d2cb8c3 100644 --- a/torch/csrc/jit/operator.h +++ b/torch/csrc/jit/operator.h @@ -2,81 +2,57 @@ // once C10 exists this can be removed, or stubbed out, but we need // it now to implement correct semantic checking for script #pragma once - +#include "ATen/ATen.h" #include "torch/csrc/jit/assertions.h" #include "torch/csrc/jit/ir.h" #include "torch/csrc/jit/function_schema.h" #include "torch/csrc/jit/stack.h" -#include "ATen/ATen.h" - -#include -#include -#include -#include -#include -#include -#include - namespace torch { namespace jit { -FunctionSchema parseSchema(const std::string& schema); +FunctionSchema parseSchema(const std::string& decl); using OperationCreator = std::function; struct TORCH_API Operator { - Operator(FunctionSchema schema, OperationCreator op_creator) - : schema_(std::make_shared(std::move(schema))), - op_creator_(std::move(op_creator)) {} + Operator(FunctionSchema schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) + : schema(std::move(schema)) + , op(std::move(op)) + , op_const_attributes(std::move(op_const_attributes)) {} - Operator(const std::string& schema, OperationCreator op_creator) - : schema_string_(schema), op_creator_(std::move(op_creator)) {} + Operator(const std::string& schema, OperationCreator op, OperationCreator op_const_attributes = nullptr) + : Operator(parseSchema(schema), std::move(op), std::move(op_const_attributes)) {} - // Helper constructor to register `op` to run + // Helper constructor to regsiter `op` to run // run for _every_ IR Node where n.kind() == name, regardless of arguments. - // This is accomplished by marking the schema varargs and having no required - // arguments. This is used for things like prim::While or prim::If that can - // take a number of different valid input types and lengths. - Operator(Symbol name, OperationCreator op_creator) - : Operator(FunctionSchema(name, {}, {}, true), std::move(op_creator)) {} - - Operator(FunctionSchema schema, Operation op) - : schema_(std::make_shared(std::move(schema))), - op_(std::make_shared(std::move(op))) {} - - Operator(const std::string& schema, Operation op) - : schema_string_(schema), - op_(std::make_shared(std::move(op))) {} - - bool matches(const Node* node) const; - - Operation getOperation(Node* node = nullptr) const { - if (op_) { - return *op_; + // This is accomplished by marking the schema varargs and having no required arguments. + // This is used for things like prim::While or prim::If that can take a number + // of different valid input types and lengths. 
+ Operator(Symbol name, OperationCreator op) + : Operator(FunctionSchema(name, {}, {}, true), op, op) {} + + FunctionSchema schema; + + bool matches(const Node* n) const; + // Operators have different versions depending on if some inputs are encoded + // as attributes or inputs. This function returns the right Operation function, + // given a node encoded for one variant. + // Behavior is undefined if matches(n) == false + // TODO (apaszke) : remove + Operation selectVariant(Node* n) const { + if(n->hasAttributes()) { + JIT_ASSERT(op_const_attributes != nullptr); + return op_const_attributes(n); + } else { + return op(n); } - AT_ASSERT(node != nullptr); - return op_creator_(node); } - - const FunctionSchema & schema() const { - // we lazily parse schema initialized from strings so that - // we do less work during static operator registration - if(!schema_) { - schema_ = std::make_shared(parseSchema(schema_string_.value())); - schema_string_ = at::nullopt; - } - return *schema_; + bool hasAttributedVersion() const { + return op_const_attributes != nullptr; } private: - mutable at::optional schema_string_; - // cannot use at::optional because windows has issues that require an assignment operator to be generated - // cannot use std::unique_ptr because initializer lists of Operators end up copying the Operator - mutable std::shared_ptr schema_; - - // Essentially a variant. - // NB: std::function has a default state (where it == nullptr). - std::shared_ptr op_; - OperationCreator op_creator_; + OperationCreator op; + OperationCreator op_const_attributes; }; const std::vector>& getAllOperatorsFor(Symbol name); @@ -86,7 +62,7 @@ const Operator& getOperatorFor(const Node* node); inline Operation getOperation(Node* node) { // note: getOperatorFor ensures that getOperatorFor(node).matches(node) == true // so the call to selectVariant is always valid. - return getOperatorFor(node).getOperation(node); + return getOperatorFor(node).selectVariant(node); } void registerOperator(Operator&& op); diff --git a/torch/csrc/jit/passes/batch_mm.cpp b/torch/csrc/jit/passes/batch_mm.cpp index 414dc1652a4da1..0e40bc8831a6df 100644 --- a/torch/csrc/jit/passes/batch_mm.cpp +++ b/torch/csrc/jit/passes/batch_mm.cpp @@ -3,9 +3,8 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/interned_strings.h" #include "torch/csrc/jit/constants.h" -#include "torch/csrc/jit/symbolic_variable.h" -#include "torch/csrc/jit/assertions.h" #include "torch/csrc/utils/functional.h" +#include "torch/csrc/jit/assertions.h" #include #include @@ -192,11 +191,12 @@ void BatchMMBlock(Block* block) { int cat_dim = s == Side::LHS ? 
1 : 0; cat_sizes[cat_dim] *= matmuls.size(); // make them really cat_sizes + auto inputs = fmap(matmuls, [=](Node *mm) { return mm->inputs()[inputs_off]; }); WithInsertPoint iguard { root.node }; - auto inputs = fmap(matmuls, [=](Node *mm) -> SymbolicVariable { return mm->inputs()[inputs_off]; }); - auto cat_output = SymbolicVariable::cat(inputs, cat_dim).value(); - cat_output->setType(type->withSizes(cat_sizes)); - return cat_output; + inputs.push_back(insertConstant(*graph, cat_dim)); + Node *cat = graph->insertNode(graph->create(aten::cat, inputs)); + cat->output()->setType(type->withSizes(cat_sizes)); + return cat->output(); }; auto lhs_batch = batch_inputs(Side::LHS, root.lhs_sizes); diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp deleted file mode 100644 index 39492f9e76c50c..00000000000000 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "torch/csrc/jit/passes/constant_propagation.h" -#include "torch/csrc/autograd/variable.h" -#include "torch/csrc/jit/constants.h" -#include "torch/csrc/jit/interpreter.h" -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/ivalue.h" -#include "torch/csrc/jit/operator.h" -#include "torch/csrc/jit/passes/dead_code_elimination.h" -#include "torch/csrc/utils/functional.h" - -namespace torch { namespace jit { - -namespace { - -std::unordered_set skip_list = { - //FIXME If & Loop require special casing because they cannot be run as a - //single node. - prim::If, - prim::Loop, - //FIXME Same problem as in DCE - cpp & python PythonOp and CppOp should be - //FIXME treated as having side effects but ONNX depends on them being removed - prim::Print, - //all the rand functions from native_functions.yaml - aten::permute, - aten::rand, - aten::rand_out, - aten::rand_like, - aten::randint, - aten::randint_out, - aten::randint_like, - aten::randn, - aten::randn_out, - aten::randn_like, - aten::randperm, - aten::randperm_out, - }; - -std::vector runNode(Node* n) { - auto op = getOperation(n); - Stack stack; - for (auto input : n->inputs()) { - stack.push_back(*(toIValue(input))); - } - op(stack); - auto var_outputs = fmap(stack, [&](IValue v) { - if (v.isTensor()) { - return IValue(autograd::as_variable_ref(v.toTensor()).data()); - } else { - return v; - } - }); - return var_outputs; -} - -void propagateNode(Node* n) { - auto outputs = runNode(n); - auto graph = n->owningGraph(); - WithInsertPoint guard(n); - for (size_t i = 0; i < outputs.size(); ++i) { - auto new_output = insertConstant(*graph, outputs[i]); - n->outputs()[i]->replaceAllUsesWith(new_output); - // let dce elimination remove n - } -} - -} // anonymous namespace - -void ConstantPropagation(Node* n, bool recurse) { - bool constant_inputs = (n->inputs().size() > 0) && - std::all_of(n->inputs().begin(), n->inputs().end(), [&](Value* v) { - return v->node()->kind() == prim::Constant; - }); - bool supported_node = skip_list.count(n->kind()) == 0; - if (constant_inputs && supported_node) { - propagateNode(n); - } - if (recurse) { - for (Block * block : n->blocks()) - ConstantPropagation(block, recurse); - } -} - -void ConstantPropagation(Block* block, bool recurse) { - ConstantPropagation(block->param_node(), recurse); - for (auto n: block->nodes()) { - ConstantPropagation(n, recurse); - } -} - -void ConstantPropagation(std::shared_ptr& graph) { - ConstantPropagation(graph->block(), true); - EliminateDeadCode(graph); -} - -}} diff --git a/torch/csrc/jit/passes/constant_propagation.h 
b/torch/csrc/jit/passes/constant_propagation.h deleted file mode 100644 index 12df329c81ccfc..00000000000000 --- a/torch/csrc/jit/passes/constant_propagation.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "torch/csrc/jit/ir.h" - -namespace torch { namespace jit { - -TORCH_API void ConstantPropagation(std::shared_ptr& graph); -TORCH_API void ConstantPropagation(Block* block, bool recurse); -TORCH_API void ConstantPropagation(Node* n, bool recurse); - -}} diff --git a/torch/csrc/jit/passes/graph_fuser.cpp b/torch/csrc/jit/passes/graph_fuser.cpp index cc8dcb8926dee0..cb3757cffb0e34 100644 --- a/torch/csrc/jit/passes/graph_fuser.cpp +++ b/torch/csrc/jit/passes/graph_fuser.cpp @@ -177,25 +177,16 @@ struct GraphFuser { } } - bool isFusableCatNode(Node * node) { - if (node->kind() != aten::cat) - return false; - if (!node->is_constant(attr::dim)) - return false; + bool allCatInputsHaveSameSize(Node * node) { + JIT_ASSERT(node->kind() == aten::cat); + std::vector inputs = node->inputs(); + if (!node->hasAttributes()) { + inputs.pop_back(); // Get rid of the dim argument + } - auto tensors_node = node->namedInput(attr::tensors)->node(); - if (tensors_node->kind() != prim::ListConstruct) return false; - // NB: Note that technically other uses of the list aren't a big problem for us. - // It would be enough to place the prim::FusedConcat before the prim::ListConstruct, and - // allUsersAreThisConsumerOrOccurAfterIt would still be satisfied. However, I don't expect this - // to be necessary any time soon, and so we're simply assuming that we don't have to deal with that. - if (tensors_node->output()->uses().size() > 1) return false; - auto tensors = tensors_node->inputs(); - - // Our fusion code assumes that all inputs have the same shapes, so we need to check this too. - auto expected = tensors.at(0)->type()->cast(); + auto expected = inputs.at(0)->type()->cast(); if (!expected) return false; - return std::all_of(tensors.begin(), tensors.end(), [&expected](Value *v) { + return std::all_of(inputs.begin(), inputs.end(), [expected](Value *v) { auto actual = v->type()->cast(); return actual && actual->sizes() == expected->sizes(); }); @@ -206,7 +197,15 @@ struct GraphFuser { // because it is not a simple map, can be put in a fusion group // as long as no items in the group read the output of concat bool isFusableAsExitNode(Node * node) { - return isFusable(node) || isFusableCatNode(node); + if(isFusable(node)) + return true; + // this concat fusion only works when all the inputs are the same size + // and we can statically infer the dimension along which we should concat + // otherwise they cannot partipate in the same map + if(node->kind() == aten::cat && node->is_constant(attr::dim) && allCatInputsHaveSameSize(node)) + return true; + + return false; } // necessary condition for fusion. If all of the uses of producer are consumer @@ -242,9 +241,8 @@ struct GraphFuser { // we can move the consumer up into the producer. // but this requires better handling of merging fusion groups so it is not done now at::optional consumer_device = getDevice(consumer); - Node *real_consumer = consumer->kind() == aten::cat ? 
consumer->namedInput(attr::tensors)->node() : consumer; return isFusable(producer->node()) && - allUsersAreThisConsumerOrOccurAfterIt(real_consumer, producer) && + allUsersAreThisConsumerOrOccurAfterIt(consumer, producer) && consumer_device && consumer_device == getDevice(producer->node()) && (*consumer_device != kCPUDevice || sharedFusionCompiler().canCompileOnCPU()); } @@ -391,24 +389,7 @@ struct GraphFuser { Node * fuse(Node * consumer, Value * producer) { auto group = consumer; - if (consumer->kind() == aten::cat) { - Graph * graph = consumer->owningGraph(); - Node * list_construct = consumer->namedInput(attr::tensors)->node(); - int64_t dim = consumer->get(attr::dim).value(); - - Node * fused_cat = graph->create(prim::FusedConcat, list_construct->inputs())->i_(attr::dim, dim); - fused_cat->insertBefore(list_construct); - fused_cat->output()->copyMetadata(consumer->output()); - consumer->output()->replaceAllUsesWith(fused_cat->output()); - topological_index[fused_cat] = topological_index[list_construct]; - - // NB: this deletes the fused_cat node from the original graph - group = createSingletonFusionGroup(fused_cat); - consumer->destroy(); - if (list_construct->output()->uses().empty()) { - list_construct->destroy(); - } - } else if (consumer->kind() != prim::FusionGroup) { + if(group->kind() != prim::FusionGroup) { group = createSingletonFusionGroup(consumer); } if (producer->node()->kind() == prim::FusionGroup) { @@ -469,6 +450,7 @@ struct GraphFuser { } } + // TODO: Remove this restriction if we ever need to distribute across // multiple return operators Node * producer_for_chunk_node = producer_for_chunk->node(); JIT_ASSERT(producer_for_chunk_node->outputs().size() == 1); @@ -539,14 +521,11 @@ struct GraphFuser { std::pair scanNode(Node * consumer) { auto stage_guard = block->owningGraph()->setStageTemporary(consumer->stage()); if(isFusableAsExitNode(consumer)) { - value_list inputs; - auto consumer_inputs = consumer->kind() == aten::cat ? - consumer->namedInput(attr::tensors)->node()->inputs() : - consumer->inputs(); // handle inputs in reverse topological order as well... // otherwise in f(a,a+b) it will appear a is used twice if we consider // the f-a fusion before the f-(a+b) fusion first. 
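As context for the allCatInputsHaveSameSize check introduced above: aten::cat is only admitted as a fusion exit node when the concat dimension is a compile-time constant and every input has the same statically known shape. A rough standalone sketch of that shape test, using plain std::vector shapes instead of the JIT's TensorType (all names here are illustrative, not the real fuser API):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for TensorType::sizes(): one shape per cat input.
using Shape = std::vector<int64_t>;

// Mirrors the "all cat inputs have the same size" requirement: fusing a
// concat exit node is only attempted when every input shape is known and equal.
bool allCatInputsHaveSameSize(const std::vector<Shape>& input_shapes) {
  if (input_shapes.empty()) return false;
  const Shape& expected = input_shapes.front();
  for (const Shape& s : input_shapes) {
    if (s != expected) return false;  // any mismatch disqualifies the cat
  }
  return true;
}

int main() {
  std::vector<Shape> ok  = {{2, 3}, {2, 3}, {2, 3}};
  std::vector<Shape> bad = {{2, 3}, {4, 3}};
  std::cout << allCatInputsHaveSameSize(ok) << " "    // 1
            << allCatInputsHaveSameSize(bad) << "\n"; // 0
}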
- for(auto i : consumer_inputs) { + value_list inputs; + for(auto i : consumer->inputs()) { if (i->node()->owningBlock() == block) { inputs.push_back(i); JIT_ASSERT(topological_index.count(i->node()) > 0); diff --git a/torch/csrc/jit/passes/lower_grad_of.h b/torch/csrc/jit/passes/lower_grad_of.h index 0ec3589e3acd31..a0a881e3002ed9 100644 --- a/torch/csrc/jit/passes/lower_grad_of.h +++ b/torch/csrc/jit/passes/lower_grad_of.h @@ -10,6 +10,6 @@ namespace torch { namespace jit { // outputs = // else: // outputs = undefineds -TORCH_API void LowerGradOf(Graph& g); +TORCH_API void LowerGradOf(Graph& graph); }} diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index ee9b76f417bd17..63fb7030aa3ad1 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -263,39 +263,6 @@ void PropagateShapeOnNode(Node * node, bool insert_expands) { default: break; // fall-through } - if (node->matches("aten::cat(Tensor[] tensors, int dim) -> Tensor", /*with_const=*/attr::dim)) { - auto list_node = node->namedInput(attr::tensors)->node(); - JIT_ASSERT(list_node->kind() == prim::ListConstruct); - auto tensors = list_node->inputs(); - if (tensors.size() > 0) { - auto input_types = fmap(tensors, [](Value *v) { return v->type()->cast(); }); - if (std::all_of(input_types.begin(), input_types.end(), - [](const TensorTypePtr& tp) { return tp != nullptr; })) { - std::vector sizes = input_types[0]->sizes(); - const int64_t dim = wrapDim(node->get(attr::dim).value(), sizes); - const int64_t ndim = sizes.size(); - - if (dim < 0 || dim >= ndim) - goto cat_fail; - - sizes[dim] = 0; - for (auto & tp : input_types) { - auto & tp_sizes = tp->sizes(); - if (sizes.size() != tp_sizes.size()) - goto cat_fail; - for (int64_t i = 0; i < ndim; ++i) { - if (sizes[i] != tp_sizes[i] && i != dim) { - goto cat_fail; - } - } - sizes[dim] += tp_sizes[dim]; - } - node->output()->setType(input_types[0]->withSizes(sizes)); - return; - } - } - } -cat_fail: bool can_propagate_by_running = canPropagateShapeByRunningIt(node); auto maybe_tensor_types = gatherTensorTypes(node); diff --git a/torch/csrc/jit/passes/to_batch.cpp b/torch/csrc/jit/passes/to_batch.cpp index f78da9b92baccc..5494cf2b78a798 100644 --- a/torch/csrc/jit/passes/to_batch.cpp +++ b/torch/csrc/jit/passes/to_batch.cpp @@ -3,530 +3,59 @@ namespace torch { namespace jit { -std::unordered_map>> ToBatch::batch_operator_table; - -std::shared_ptr ToBatch::getBatchOperator(std::string name, int64_t num_inputs){ - if(batch_operator_table.find(name) == batch_operator_table.end()){ - throw std::runtime_error("function " + name + " is not supported in batched tensor yet"); - } - auto ops = batch_operator_table.at(name); - if(num_inputs == -1) // default function - return ops[0]; - for(auto op : ops){ - if(size_t(num_inputs) == op->inputs().size()) - return op; - } - throw std::runtime_error("function " + name + " with " + std::to_string(num_inputs) + " inputs is not supported in batched tensor yet"); -} - -// replace aten operator node with BatchTensor operator graph -void ToBatch::visitAten(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - auto func_name = std::string(n->kind().toUnqualString()); - std::vector new_inputs; - for(Value *input : n->inputs()){ - if(rn_env.find(input) == rn_env.end()){ // non-tensor input - auto new_input = batch_map.at(input); - new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); - } - else{ // batched tensor input 
- new_inputs.push_back(rn_env.at(input)); - } - } - - // transform scalar to tensor before pass to batch operator script - for(size_t i = 0; i < new_inputs.size(); i++){ - auto input = new_inputs[i]; - if(input->type() == IntType::get() || input->type() == FloatType::get()){ - auto to_tensor_node = res_graph->createNumToTensor(input); - res_graph->insertNode(to_tensor_node); - new_inputs[i] = to_tensor_node->output(); - } - } - - auto batch_graph = getBatchOperator(func_name, new_inputs.size()); - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); - - // Assume all outputs from inlined operator implementation are in the triple form batched tensor or just a single non-tensor. - if(outputs.size() == 1){ - // if previous output is scalar, transform new output back to scalar from dynamic - if(n->outputs()[0]->type() != outputs[0]->type()){ - Node* to_scalar_node; - if(n->outputs()[0]->type() == IntType::get()){ - to_scalar_node = res_graph->createTensorToNum(IntType::get(), outputs[0]); - } - else if(n->outputs()[0]->type() == FloatType::get()){ - to_scalar_node = res_graph->createTensorToNum(FloatType::get(), outputs[0]); - } - else{ - throw std::runtime_error("NYI: scalar type other than int, float is not supported yet"); - } - res_graph->insertNode(to_scalar_node); - rn_env[n->outputs()[0]] = to_scalar_node->output(); - } - else - rn_env[n->outputs()[0]] = outputs[0]; - } - else{ - for(size_t i = 0; i < n->outputs().size(); i++){ - auto output = n->outputs()[i]; - batch_map[output] = std::vector(outputs.begin() + i * EXP_BTENSOR_SIZE, outputs.begin() + i * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE); - } - } -} - -// clone prim::Constant to new graph -// batching transformation is applied to the output of prim::NumToTensor. -// If there is a prim::NumToTensor following prim::Constant, it will be finally transformed to BatchTensor. 
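The visitAten path above assumes every batched tensor travels through this pass as the expanded triple {data, mask, dims}: the data is produced by the underlying op, masks are combined multiplicatively, and dims are OR-ed, as the transformed graphs later in this file show. A minimal self-contained sketch of that convention, with plain std::vector entries standing in for tensors (the struct and function names are made up for illustration):

#include <iostream>
#include <vector>

// Illustrative stand-in for the expanded BatchTensor triple {data, mask, dims}:
// data is the payload, mask marks which entries are valid, dims marks which
// dimensions are dynamic.
struct Batched {
  std::vector<double> data;
  std::vector<bool>   mask;
  std::vector<bool>   dims;
};

// A batched elementwise add: add the data, multiply (AND) the masks, OR the
// dims, mirroring aten::add -> {aten::add, aten::mul, aten::__or__}.
Batched batched_add(const Batched& a, const Batched& b) {
  Batched out = a;
  for (size_t i = 0; i < out.data.size(); ++i) {
    out.data[i] = a.data[i] + b.data[i];
    out.mask[i] = a.mask[i] && b.mask[i];
  }
  for (size_t d = 0; d < out.dims.size(); ++d)
    out.dims[d] = a.dims[d] || b.dims[d];
  return out;
}

int main() {
  Batched a{{1, 2, 3}, {true, true, false}, {true}};
  Batched b{{4, 5, 6}, {true, false, true}, {false}};
  Batched c = batched_add(a, b);
  for (double v : c.data) std::cout << v << " ";  // 5 7 9
  std::cout << "\n";
}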
-void ToBatch::visitConstant(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - rn_env[n->output()] = r_node->output(); -} - -// change return tensor to expanded batched tensor, eg: {data, mask, dims} -void ToBatch::visitNumToTensor(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("batch_from_scalar_tensor"), r_node->outputs()); - batch_map[n->output()] = outputs; -} - -// clone prim::TensorToNum to new graph -void ToBatch::visitTensorToNum(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - if(rn_env.find(n->input()) == rn_env.end()){ - rn_env[n->input()] = batch_map.at(n->input())[0]; - } - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - rn_env[n->output()] = r_node->output(); - batch_map[n->output()] = batch_map.at(n->input()); -} - -// clone prim::ListConstruct to new graph -void ToBatch::visitListConstruct(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - if(n->inputs()[0]->type() == DynamicType::get()){ // TensorList: expand directly - std::vector inputs; - for(Value* input: n->inputs()) { - auto res = batch_map.at(input); - inputs.insert(inputs.end(), res.begin(), res.end()); - } - batch_map[n->output()] = inputs; - } - else { // ScalarList: transform to tensor, then transform back - for(Value* input : n->inputs()) { - if(rn_env.find(input) == rn_env.end()){ - rn_env[input] = batch_map.at(input)[0]; - } - } - auto* r_node = res_graph->createClone(n, rn_fn); - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - // transform int[] to tensor - auto to_tensor_node = res_graph->create(Symbol::fromQualString("aten::_list_to_tensor")); - to_tensor_node->setStage(n->stage()); - to_tensor_node->addInput(r_node->output()); - res_block->appendNode(to_tensor_node); - rn_env[n->output()] = to_tensor_node->output(); - } -} - -// prim::If transformation: -// elif is not supported -// -// transformation example: -// @torch.jit.batch(batch_size=4) -// def batch_if(a, b): -// if a > b: -// a += b -// else: -// a -= b -// return a -// -// original graph: -// graph(%a.1 : Dynamic -// %b : Dynamic) { -// %2 : Dynamic = aten::gt(%a.1, %b) -// %a : Dynamic = prim::If(%2) -// block0() { -// %a.2 : Dynamic = aten::add[alpha={1}](%a.1, %b) -// -> (%a.2) -// } -// block1() { -// %a.3 : Dynamic = aten::sub[alpha={1}](%a.1, %b) -// -> (%a.3) -// } -// return (%a); -// } -// -// transformed graph: -// graph(%a.1_data : Dynamic -// %a.1_mask : Dynamic -// %a.1_dims : Dynamic -// %b_data : Dynamic -// %b_mask : Dynamic -// %b_dims : Dynamic) { -// %6 : Dynamic = aten::gt(%a.1_data, %b_data) // calculate condition -// %7 : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %8 : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %9 : int = prim::TensorToNum(%6) -// %10 : Long() = prim::Constant[value={1}]() // if_block -// %alpha.1 : float = prim::TensorToNum(%10) -// %data.1 : Dynamic = aten::add(%a.1_data, %b_data, %alpha.1) -// %mask.1 : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %dims.1 : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %15 : Long() = prim::Constant[value={1}]() // 
else_block -// %alpha : float = prim::TensorToNum(%15) -// %data.4 : Dynamic = aten::sub(%a.1_data, %b_data, %alpha) -// %mask : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %dims : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %20 : Dynamic = aten::type_as(%7, %6) // combine two outputs (batch_where) -// %cond_mask.1 : Dynamic = aten::mul(%6, %20) -// %22 : int = aten::dim(%cond_mask.1) -// %23 : int = prim::Constant[value=1]() -// %24 : int = aten::eq(%22, %23) -// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%24) -// block0() { -// %28 : int = aten::dim(%data.1) -// %29 : int = prim::Constant[value=1]() -// %30 : int = aten::sub(%28, %29) -// %31 : int = prim::Constant[value=1]() -// %data.3 : Dynamic = prim::Loop(%30, %31, %cond_mask.1) -// block0(%_ : int, %34 : Dynamic) { -// %35 : int = prim::Constant[value=1]() -// %36 : int = aten::neg(%35) -// %data.2 : Dynamic = aten::unsqueeze(%34, %36) -// %38 : int = prim::Constant[value=1]() -// -> (%38, %data.2) -// } -// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) -// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask.1) -// -> (%cond_data.1, %cond_mask.2, %data.3) -// } -// block1() { -// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) -// } -// %res_data : Dynamic = aten::where(%cond_data, %data.1, %data.4) -// %res_mask : Dynamic = aten::where(%cond_mask, %mask.1, %mask) -// %res_dims : Dynamic = aten::__or__(%dims.1, %dims) -// return (%res_data, %res_mask, %res_dims); -// } -void ToBatch::visitIf(Node* n, Block* block, Block* res_block){ - toBatch(n->blocks()[0], res_block); - toBatch(n->blocks()[1], res_block); - - // combine results from two if paths - for(size_t i = 0; i < n->outputs().size(); i++){ - std::vector inputs; - if(batch_map.find(n->input()) == batch_map.end()){ // cond is scalar - inputs.push_back(rn_env.at(n->input())); - } - else{ // cond is tensor - auto cond = batch_map.at(n->input()); - inputs.insert(inputs.end(), cond.begin(), cond.end()); - } - auto if_output = batch_map.at(n->blocks()[0]->outputs()[i]); - inputs.insert(inputs.end(), if_output.begin(), if_output.end()); - auto else_output = batch_map.at(n->blocks()[1]->outputs()[i]); - inputs.insert(inputs.end(), else_output.begin(), else_output.end()); - auto outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where", inputs.size()), inputs); - batch_map[n->outputs()[i]] = outputs; - } -} - -// prim::Loop transformation: -// -// transformation example: -// @torch.jit.batch(batch_size=4) -// def batch_while(a, b): -// while a > b: -// a -= b -// return a -// -// original graph: -// graph(%a.1 : Dynamic -// %b : Dynamic) { -// %2 : int = prim::Constant[value={2147483647}]() -// %3 : Dynamic = aten::gt(%a.1, %b) -// %a : Dynamic = prim::Loop(%2, %3, %a.1) -// block0(%4 : Dynamic, %5 : Dynamic) { -// %a.2 : Dynamic = aten::sub[alpha={1}](%5, %b) -// %9 : Dynamic = aten::gt(%a.2, %b) -// -> (%9, %a.2) -// } -// return (%a); -// } -// -// transformed graph: -// graph(%a.1_data : Dynamic -// %a.1_mask : Dynamic -// %a.1_dims : Dynamic -// %b_data : Dynamic -// %b_mask : Dynamic -// %b_dims : Dynamic) { -// %6 : int = prim::Constant[value=2147483647]() -// %7 : Dynamic = aten::gt(%a.1_data, %b_data) -// %8 : Dynamic = aten::mul(%a.1_mask, %b_mask) -// %9 : Dynamic = aten::__or__(%a.1_dims, %b_dims) -// %10 : int = prim::TensorToNum(%7) -// %11 : Dynamic = aten::mul(%7, %8) -// %12 : Dynamic = aten::sum(%11) -// %13 : Dynamic = aten::gt[other={0}](%12) // cond_any -// %14 : int = prim::TensorToNum(%13) 
-// %62 : Dynamic, %63 : Dynamic, %64 : Dynamic, %a : Dynamic, %60 : Dynamic, %61 : Dynamic = prim::Loop(%6, %14, %7, %8, %9, %a.1_data, %a.1_mask, %a.1_dims) -// block0(%loop_num : int, %cond_data.2 : Dynamic, %cond_mask.3 : Dynamic, %cond_dims : Dynamic, %6_data : Dynamic, %6_mask : Dynamic, %6_dims : Dynamic) { -// %23 : Long() = prim::Constant[value={1}]() -// %alpha : float = prim::TensorToNum(%23) -// %data.1 : Dynamic = aten::sub(%6_data, %b_data, %alpha) -// %mask : Dynamic = aten::mul(%6_mask, %b_mask) -// %dims : Dynamic = aten::__or__(%6_dims, %b_dims) -// %28 : Dynamic = aten::gt(%data.1, %b_data) -// %29 : Dynamic = aten::mul(%mask, %b_mask) -// %30 : Dynamic = aten::__or__(%dims, %b_dims) -// %31 : int = prim::TensorToNum(%28) -// %32 : Dynamic = aten::type_as(%cond_mask.3, %cond_data.2) // update outputs (batch_where) -// %cond_mask.1 : Dynamic = aten::mul(%cond_data.2, %32) -// %34 : int = aten::dim(%cond_mask.1) -// %35 : int = prim::Constant[value=1]() -// %36 : int = aten::eq(%34, %35) -// %cond_data : Dynamic, %cond_mask : Dynamic, %data : Dynamic = prim::If(%36) -// block0() { -// %40 : int = aten::dim(%data.1) -// %41 : int = prim::Constant[value=1]() -// %42 : int = aten::sub(%40, %41) -// %43 : int = prim::Constant[value=1]() -// %data.3 : Dynamic = prim::Loop(%42, %43, %cond_mask.1) -// block0(%_ : int, %46 : Dynamic) { -// %47 : int = prim::Constant[value=1]() -// %48 : int = aten::neg(%47) -// %data.2 : Dynamic = aten::unsqueeze(%46, %48) -// %50 : int = prim::Constant[value=1]() -// -> (%50, %data.2) -// } -// %cond_data.1 : Dynamic = aten::expand_as(%data.3, %data.1) -// %cond_mask.2 : Dynamic = aten::expand_as(%data.3, %mask) -// -> (%cond_data.1, %cond_mask.2, %data.3) -// } -// block1() { -// -> (%cond_mask.1, %cond_mask.1, %cond_mask.1) -// } -// %res_data : Dynamic = aten::where(%cond_data, %data.1, %6_data) -// %res_mask : Dynamic = aten::where(%cond_mask, %mask, %6_mask) -// %res_dims : Dynamic = aten::__or__(%dims, %6_dims) -// %56 : Dynamic = aten::mul(%28, %29) -// %57 : Dynamic = aten::sum(%56) -// %58 : Dynamic = aten::gt[other={0}](%57) -// %59 : int = prim::TensorToNum(%58) -// -> (%59, %28, %29, %30, %res_data, %res_mask, %res_dims) -// } -// return (%a, %60, %61); -// } -void ToBatch::visitLoop(Node* n, Block* block, Block* res_block){ - auto res_graph = res_block->owningGraph(); - // bool cond_is_tensor indicates whether cond is tensor - // cond_is_tensor = false, eg: for loop, n->inputs()[1] = byte() - // cond_is_tensor = true, eg: in some while loop, cond is a batched tensor, - // we need to add expanded cond to the inputs of loop node and block, - // and compute cond_any as cond for while loop - bool cond_is_tensor = (batch_map.find(n->inputs()[1]) != batch_map.end()); - - // create prim::Loop node for res_block - - // type of cond in loop should be int type - if(rn_env.at(n->inputs()[0])->type() != IntType::get()){ - auto to_int_node = res_graph->createTensorToNum(IntType::get(), rn_env.at(n->inputs()[0])); - res_graph->insertNode(to_int_node); - rn_env[n->inputs()[0]] = to_int_node->output(); - } - if(cond_is_tensor){ - auto cond = batch_map.at(n->inputs()[1]); - auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); - auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); - res_graph->insertNode(to_int_node); - rn_env[n->inputs()[1]] = to_int_node->output(); - } - for(size_t i = 2; i < n->inputs().size(); i++){ - auto input = n->inputs()[i]; - rn_env[input] = 
batch_map.at(input)[0]; - } - auto* r_node = res_graph->createClone(n, rn_fn, /*copy_blocks=*/false); - - // change inputs of prim::Loop - if(cond_is_tensor){ - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - auto cond = batch_map.at(n->inputs()[1]); - r_node->insertInput(i + 2, cond[i]); - } - } - for(size_t i = 2; i < n->inputs().size(); i++){ - for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ - r_node->insertInput((i - 2) * EXP_BTENSOR_SIZE + EXP_BTENSOR_SIZE * cond_is_tensor + 2 + j, batch_map.at(n->inputs()[i])[j]); - } - } - r_node->setStage(n->stage()); - res_block->appendNode(r_node); - - // create block for Loop node in res_block - // if cond is tensor: first 4 inputs of block: cond_any, cond_data, cond_mask, cond_dims - // if cond is not tensor: first 1 input of block: cond - auto loop_block = r_node->addBlock(); - - // add inputs - loop_block->addInput("loop_num"); - loop_block->inputs()[0]->setType(IntType::get()); - rn_env[n->blocks()[0]->inputs()[0]] = loop_block->inputs()[0]; - if(cond_is_tensor){ - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - loop_block->addInput("cond_" + EXP_BTENSOR_NAME[i]); - } - } - for(size_t i = 1; i < n->blocks()[0]->inputs().size(); i++){ - auto input = n->blocks()[0]->inputs()[i]; - auto name = input->uniqueName(); - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - loop_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); - } - batch_map[input] = std::vector(loop_block->inputs().slice((i - 1) * EXP_BTENSOR_SIZE + 1 + EXP_BTENSOR_SIZE * cond_is_tensor, EXP_BTENSOR_SIZE).vec()); - } - - toBatch(n->blocks()[0], loop_block); - - WithInsertPoint guard(loop_block); - - // use where operator to update variables and add to outputs - for(size_t i = 0; i < n->outputs().size(); i++){ - std::vector inputs, outputs; - if(cond_is_tensor){ - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - inputs.push_back(loop_block->inputs()[j + 1]); - } - auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); - inputs.insert(inputs.end(), data.begin(), data.end()); - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + EXP_BTENSOR_SIZE + 1]); - } - outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("where"), inputs); - } - else{ - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - inputs.push_back(loop_block->inputs()[i * EXP_BTENSOR_SIZE + j + 1]); - } - auto data = batch_map.at(n->blocks()[0]->outputs()[i + 1]); - inputs.insert(inputs.end(), data.begin(), data.end()); - outputs = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("update"), inputs); - } - batch_map[n->outputs()[i]] = outputs; - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - loop_block->registerOutput(outputs[j]); - } - } - - // update loop conditions - if(cond_is_tensor){ - auto cond = batch_map.at(n->blocks()[0]->outputs()[0]); - auto cond_any = script::inlineCallTo(*res_block->owningGraph(), *getBatchOperator("any"), cond); - auto to_int_node = res_graph->createTensorToNum(IntType::get(), cond_any[0]); - res_graph->insertNode(to_int_node); - loop_block->insertOutput(0, to_int_node->output()); - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - loop_block->insertOutput(i + 1, cond[i]); - } - } - else{ - auto cond = rn_env.at(n->blocks()[0]->outputs()[0]); - loop_block->insertOutput(0, cond); - } - - // change outputs of prim::Loop - auto size = r_node->outputs().size(); - for(size_t i = 0; i < size; i++){ - for(size_t j = 1; j < EXP_BTENSOR_SIZE; j++){ - r_node->insertOutput(i * EXP_BTENSOR_SIZE + j); - } - 
batch_map[n->outputs()[i]] = r_node->outputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec(); - } - // add cond to outputs of loop node - if(cond_is_tensor){ - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - r_node->insertOutput(i); - } - } -} +std::unordered_map> ToBatch::batch_operator_table; void ToBatch::toBatch(Block* block, Block* res_block) { - WithInsertPoint guard(res_block); - - // change inputs of block - expand tensor to batchtensor eg: (data, mask, dims) - // eg: a -> a_data, a_mask, a_dims - // for block in prim::Loop, register inputs separately to deal with cond - if(!block->owningNode() || block->owningNode()->kind() != prim::Loop){ - auto size = block->inputs().size(); - for(size_t i = 0; i < size; i++){ - auto input = block->inputs()[i]; - auto name = input->uniqueName(); - for(size_t j = 0; j < EXP_BTENSOR_SIZE; j++){ - res_block->addInput(name + "_" + EXP_BTENSOR_NAME[j]); - } - batch_map[input] = std::vector(res_block->inputs().slice(i * EXP_BTENSOR_SIZE, EXP_BTENSOR_SIZE).vec()); - } + // change inputs of a graph - expand tensor to {data, mask, dims} + auto size = block->inputs().size(); + for(size_t i = 0; i < size; i++){ + auto input = block->inputs()[i]; + auto name = input->uniqueName(); + res_block->addInput(name + "_data"); + res_block->addInput(name + "_mask"); + res_block->addInput(name + "_dims"); + batch_map[input] = std::vector(res_block->inputs().slice(i * 3, 3)); } for (auto it = block->nodes().begin(); it != block->nodes().end(); it++) { auto n = *it; + // replace tensor operator to BatchTensor operator if(n->kind().is_aten()){ - visitAten(n, block, res_block); - } - else if(n->kind().is_prim()){ - switch(n->kind()){ - case prim::Constant: - visitConstant(n, block, res_block); - break; - case prim::NumToTensor: - visitNumToTensor(n, block, res_block); - break; - case prim::TensorToNum: - visitTensorToNum(n, block, res_block); - break; - case prim::ListConstruct: - visitListConstruct(n, block, res_block); - break; - case prim::If: - visitIf(n, block, res_block); - break; - case prim::Loop: - visitLoop(n, block, res_block); - break; - default: - throw std::runtime_error("NYI: node of prim kind other than [Constant, NumToTensor, TensorToNum, If, Loop] is not supported yet"); + auto batch_graph = batch_operator_table.at(n->kind().toUnqualString()); + WithInsertPoint guard(res_block); + std::vector new_inputs; + for(Value *input : n->inputs()){ + if(batch_map.find(input) != batch_map.end()){ + auto new_input = batch_map.at(input); + new_inputs.insert(new_inputs.end(), new_input.begin(), new_input.end()); + } + else{ + throw std::runtime_error("NYI: non-tensor input for aten operator is not supported yet"); + } + } + auto outputs = script::inlineCallTo(*res_block->owningGraph(), *batch_graph, new_inputs); + // Assume all outputs from inlined operator implementation are in the triple form. 
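In the simplified toBatch above, each aten node is replaced by inlining the single graph registered under its name over the flattened {data, mask, dims} inputs. A rough standalone sketch of that table-lookup-and-apply flow, with std::function standing in for the registered Graph and for script::inlineCallTo (all names below are stand-ins, not the real API):

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for a registered batch operator: consumes the flattened
// {data, mask, dims} inputs and produces outputs in the same triple form.
using BatchOp = std::function<std::vector<double>(const std::vector<double>&)>;

// Mirrors ToBatch::batch_operator_table: one implementation per op name.
std::unordered_map<std::string, BatchOp> batch_operator_table = {
  {"add", [](const std::vector<double>& in) {
     // in = {a_data, a_mask, a_dims, b_data, b_mask, b_dims}
     return std::vector<double>{in[0] + in[3],    // data
                                in[1] * in[4],    // mask
                                in[2] + in[5]};   // dims (placeholder OR)
   }},
};

std::vector<double> visitAtenLike(const std::string& name,
                                  const std::vector<double>& flat_inputs) {
  auto it = batch_operator_table.find(name);
  if (it == batch_operator_table.end())
    throw std::runtime_error("function " + name +
                             " is not supported in batched tensor yet");
  return it->second(flat_inputs);  // "inline" the registered implementation
}

int main() {
  auto out = visitAtenLike("add", {1, 1, 0, 2, 1, 0});
  std::cout << out[0] << " " << out[1] << " " << out[2] << "\n";  // 3 1 0
}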
+ for(size_t i = 0; i < n->outputs().size(); i++){ + auto output = n->outputs()[i]; + batch_map[output] = std::vector(outputs.begin() + i * 3, outputs.begin() + i * 3 + 3); } } - else{ - throw std::runtime_error("NYI: node that is not aten or prim kind is not supported yet"); + else if(n->kind().is_prim()){ + throw std::runtime_error("NYI: node of prim kind is not supported to transform to batch graph yet"); } } - // change outputs of block - expand tensor to batchtensor(data, mask, dims) - // for block in prim::Loop, register outputs separately to deal with cond and cond_any - // for block in prim::If, register outputs separately by combining outputs from two paths and return - if(!block->owningNode() || (block->owningNode()->kind() != prim::Loop && block->owningNode()->kind() != prim::If)) { - for(Value* output : block->outputs()){ - auto r_output = batch_map.at(output); - for(size_t i = 0; i < EXP_BTENSOR_SIZE; i++){ - res_block->registerOutput(r_output[i]); - } - } + // change outputs of a graph - expand tensor to {data, mask, dims} + for(Value* output : block->outputs()){ + auto r_output = batch_map.at(output); + res_block->registerOutput(r_output[0]); + res_block->registerOutput(r_output[1]); + res_block->registerOutput(r_output[2]); } } std::shared_ptr to_batch_graph(std::shared_ptr& graph){ // std::cout<toString()< res_graph = std::make_shared(graph->scope_root()); + auto res_graph = std::make_shared(graph->scope_root()); ToBatch to_batch; to_batch.toBatch(graph->block(), res_graph->block()); // std::cout<toString()<(); m.def("to_batch_graph", &to_batch_graph); m.def("register_batch_operator", [](std::string name, std::shared_ptr graph){ - ToBatch::batch_operator_table[name].push_back(graph); + ToBatch::batch_operator_table[name] = graph; }); } diff --git a/torch/csrc/jit/passes/to_batch.h b/torch/csrc/jit/passes/to_batch.h index 6545e2a2d4f8ed..23c23a0632b310 100644 --- a/torch/csrc/jit/passes/to_batch.h +++ b/torch/csrc/jit/passes/to_batch.h @@ -3,33 +3,14 @@ #include "torch/csrc/jit/pybind.h" #include "torch/csrc/jit/ir.h" -#include - namespace torch { namespace jit { class ToBatch { private: - // number of tensors to represent a expanded BatchTensor. {data, mask, dims} for now. 
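The header comment above, together with the EXP_BTENSOR_SIZE / EXP_BTENSOR_NAME constants it describes (removed just below), fixes the expansion convention: each tensor value becomes three values named <name>_data, <name>_mask, <name>_dims. A tiny standalone sketch of that renaming scheme (illustrative names only, independent of the JIT graph types):

#include <iostream>
#include <string>
#include <vector>

// Expansion convention used by the pass: one original tensor input is
// replaced by three inputs carrying the data/mask/dims suffixes.
const std::vector<std::string> EXP_BTENSOR_NAME = {"data", "mask", "dims"};

std::vector<std::string> expandInputName(const std::string& name) {
  std::vector<std::string> expanded;
  for (const std::string& suffix : EXP_BTENSOR_NAME)
    expanded.push_back(name + "_" + suffix);
  return expanded;
}

int main() {
  for (const std::string& n : expandInputName("a.1"))
    std::cout << n << "\n";  // a.1_data, a.1_mask, a.1_dims
}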
- const size_t EXP_BTENSOR_SIZE = 3; - const std::vector EXP_BTENSOR_NAME = {"data", "mask", "dims"}; // mapping from tensor in original graph to {data, mask, dims} in new graph std::unordered_map> batch_map; - // mapping from input in original graph to new input in new graph - used in createClone - std::unordered_map rn_env; - std::function rn_fn = [this](Value* v) { return rn_env.at(v); }; - -private: - std::shared_ptr getBatchOperator(std::string name, int64_t input_num = -1); - void visitAten(Node* n, Block* block, Block* res_block); - void visitConstant(Node* n, Block* block, Block* res_block); - void visitNumToTensor(Node* n, Block* block, Block* res_block); - void visitTensorToNum(Node* n, Block* block, Block* res_block); - void visitListConstruct(Node* n, Block* block, Block* res_block); - void visitIf(Node* n, Block* block, Block* res_block); - void visitLoop(Node* n, Block* block, Block* res_block); - public: - static std::unordered_map>> batch_operator_table; + static std::unordered_map> batch_operator_table; TORCH_API void toBatch(Block* block, Block* res_block); }; diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 0598e651d32437..415fc311086ac9 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -4,70 +4,26 @@ namespace torch { namespace jit { -inline Stack createStack(const py::tuple& tuple, at::ArrayRef inputs, size_t reserve_extra_space = 0) { - if (tuple.size() != inputs.size()) { - throw std::runtime_error("expected " + std::to_string(inputs.size()) + - " inputs, but got " + std::to_string(tuple.size())); - } - static const auto castToIValue = [](const py::object& obj, Type& t) -> IValue{ - switch (t.kind()) { - case TypeKind::DynamicType: - case TypeKind::TensorType: - return py::cast(obj); - case TypeKind::FloatType: - return py::cast(obj); - case TypeKind::IntType: - return py::cast(obj); - case TypeKind::NoneType: - return {}; - case TypeKind::ListType: - case TypeKind::TupleType: - throw std::runtime_error("Lists and tuples are not supported yet"); - case TypeKind::NumberType: - throw std::runtime_error("Insufficient type information to convert input"); - } - throw std::runtime_error("Missing cases in castToIValue! 
File a bug report."); - }; +inline Stack createStack(const py::tuple& tuple, size_t reserve_extra_space = 0) { Stack result; result.reserve(tuple.size() + reserve_extra_space); - for (size_t i = 0; i < inputs.size(); ++i) { - result.push_back(castToIValue(tuple[i], *inputs[i]->type())); + for(auto e : tuple) { + result.push_back(py::cast(e)); } return result; } -inline py::object wrapStack(Stack&& outputs, at::ArrayRef output_vals) { - if (outputs.size() != output_vals.size()) { - throw std::runtime_error("expected " + std::to_string(output_vals.size()) + - " outputs, but got " + std::to_string(outputs.size())); - } - static const auto createOutput = [](IValue && ivalue, Value * value) -> py::object { - switch (value->type()->kind()) { - case TypeKind::DynamicType: - case TypeKind::TensorType: - return py::cast(autograd::Variable(ivalue.toTensor())); - case TypeKind::FloatType: - return py::cast(ivalue.toDouble()); - case TypeKind::IntType: - return py::cast(ivalue.toInt()); - case TypeKind::NoneType: - return py::none(); - case TypeKind::ListType: - case TypeKind::TupleType: - throw std::runtime_error("Lists and tuples are not supported yet"); - case TypeKind::NumberType: - throw std::runtime_error("Insufficient type information to convert input"); - } - throw std::runtime_error("Missing cases in createOutput! File a bug report."); - }; +inline py::object wrapStack(Stack&& outputs) { if (outputs.size() == 0) { return py::none(); } else if (outputs.size() == 1) { - return createOutput(std::move(outputs[0]), output_vals[0]); + JIT_ASSERT(outputs[0].isTensor()); + return py::cast(autograd::as_variable_ref(std::move(outputs[0]).toTensor())); } else { py::tuple tuple(outputs.size()); for(size_t i = 0; i < outputs.size(); i++) { - tuple[i] = createOutput(std::move(outputs[i]), output_vals[i]); + JIT_ASSERT(outputs[i].isTensor()); + tuple[i] = py::cast(autograd::as_variable_ref(std::move(outputs[i]).toTensor())); } return tuple; } diff --git a/torch/csrc/jit/python_arg_flatten.h b/torch/csrc/jit/python_arg_flatten.h index 3e1477e52e0701..b5139032fde169 100644 --- a/torch/csrc/jit/python_arg_flatten.h +++ b/torch/csrc/jit/python_arg_flatten.h @@ -14,7 +14,7 @@ namespace torch { namespace jit { namespace python { struct IODescriptor { struct VariableMetadata { VariableMetadata(const autograd::Variable& var) - : sizes(var.sizes().vec()) + : sizes(var.sizes()) , type(var.type().scalarType()) , device(var.type().is_cuda() ? 
var.get_device() : -1) , requires_grad(var.requires_grad()) {} @@ -104,7 +104,7 @@ struct ParsedArgs { ParsedArgs flatten(py::handle obj); -PyObject* unflatten(at::ArrayRef vars, +PyObject* unflatten(at::ArrayRef outputs, const IODescriptor& structure); }}} // namespace torch::jit::python diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index b72fdb6b8860b1..81211085569953 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -451,22 +451,10 @@ void initPythonIRBindings(PyObject * module_) { .def("scalarType",[](Type& t) { return at::toString(t.expect()->scalarType()); }) - .def("__eq__", [](std::shared_ptr& self, std::shared_ptr& other) { - return *self == *other; - }) - .def("isSubtypeOf", [](std::shared_ptr& self, std::shared_ptr other) { - return self->isSubtypeOf(other); - }); + ; - py::class_>(m, "NumberType") - .def_static("get", &NumberType::get); - py::class_>(m, "IntType") - .def_static("get", &IntType::get); - py::class_>(m, "FloatType") - .def_static("get", &FloatType::get); py::class_>(m, "DynamicType") - .def_static("get", &DynamicType::get); - + .def(py::init([](){ return DynamicType::create(); })); py::class_>(m, "TupleType") .def(py::init([](std::vector a){ return TupleType::create(a); })) .def("elements", [](TupleType &self){ @@ -477,9 +465,7 @@ void initPythonIRBindings(PyObject * module_) { return types; }); py::class_>(m, "ListType") - .def_static("ofInts", &ListType::ofInts) - .def_static("ofTensors", &ListType::ofTensors) - .def("getElementType", &ListType::getElementType); + .def_static("ofInts", &ListType::ofInts); py::class_(m,"Use") .def_readonly("user",&Use::user) diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp index 0496af67412654..7439b2b5e334cc 100644 --- a/torch/csrc/jit/python_tracer.cpp +++ b/torch/csrc/jit/python_tracer.cpp @@ -103,10 +103,10 @@ void pythonRecordSourceLocation(Node* n) { n->setSourceLocation(sl); } -void initPythonTracerBindings(PyObject* module) { +void initPythonTracerBindings(PyObject* module_) { setRecordSourceLocation(pythonRecordSourceLocation); - auto m = py::handle(module).cast(); + auto m = py::handle(module_).cast(); py::class_>(m, "TracingState", py::dynamic_attr()) // NB: no constructor; you have to get it from C++ code .def("__repr__", [](const TracingState& s) { diff --git a/torch/csrc/jit/register_prim_ops.cpp b/torch/csrc/jit/register_prim_ops.cpp index f2b8ea18a2be24..8fe747e59900f0 100644 --- a/torch/csrc/jit/register_prim_ops.cpp +++ b/torch/csrc/jit/register_prim_ops.cpp @@ -231,18 +231,6 @@ RegisterOperators reg({ push(stack, std::move(vals)); return 0; }; - } else if (lt->getElementType()->isSubtypeOf(DynamicType::get())) { - return [=](Stack& stack) { - const size_t stack_size = stack.size(); - std::vector vals; - vals.reserve(num_inputs); - for (size_t i = stack_size - num_inputs; i < stack_size; ++i) { - vals.push_back(std::move(stack[i]).toTensor()); - } - drop(stack, num_inputs); - push(stack, std::move(vals)); - return 0; - }; } else { std::stringstream ss; ss << "unsupported list type: " << *lt->getElementType(); @@ -347,35 +335,7 @@ RegisterOperators reg2({ return 0; }; }), - Operator( - "aten::_tensor_to_list(Tensor a) -> int[]", - [](Node* node) { - return [=](Stack& stack) { - at::Tensor t; - pop(stack, t); - std::vector elems; - for(int i = 0; i < t.size(0); i++){ - elems.push_back(*t[i].toIntData()); - } - push(stack, jit::IntList::create(elems)); - return 0; - }; - }), - Operator( - "aten::_list_to_tensor(int[] a) -> 
Tensor", - [](Node* node) { - return [=](Stack& stack) { - std::vector l; - pop(stack, l); - auto t = torch::empty( - {static_cast(l.size())}, at::dtype(at::kInt)); - for(size_t i = 0; i < l.size(); i++){ - t[i] = l[i]; - } - push(stack, t); - return 0; - }; - }), + // commutative DEFINE_ST_OP(mul, at::mul(b, a)) DEFINE_ST_OP(add, at::add(b, a)) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 4f27cb25b53cb7..0016f69b5ce07b 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -351,19 +351,37 @@ Value* createNumber(Graph& g, const SourceRange& loc, const at::Tensor& val) { return output; } +Value* createStack(Graph& g, const SourceRange& loc, at::ArrayRef inputs) { + // bake in constant propagation for the all-constant case because it is + // common to see constant lists like [1, 2] passed to attributes + bool all_constant = std::all_of(inputs.begin(), inputs.end(), [&](Value* v) { + return v->node()->kind() == prim::Constant; + }); + if(all_constant) { + auto values = fmap(inputs, [&](Value* v) { + return v->node()->t(attr::value); + }); + return insertConstant(g, at::stack(values), loc); + } + return g.insertNode(g.create(aten::stack, inputs) + ->i_(attr::dim, 0) + ->setSourceLocation(std::make_shared(loc)))->output(); +} + +static bool isTensorSubtype(Value* v) { + return v->type()->isSubtypeOf(DynamicType::get()); +} + at::optional> getIntListAttribute(at::optional N, Value* input) { auto list = constant_as>(input); if(list) - return list.value()->elements().vec(); - + return std::vector(list.value()->elements()); // broadcast IntList[3] with value 4 -> {4, 4, 4} if(!N) return at::nullopt; - auto r = constant_as(input); if(!r) return at::nullopt; - // broadcast to attribute size return std::vector(*N, *r); } @@ -437,46 +455,51 @@ at::optional> tryMatchSchema( } // check input types - std::vector matched_inputs; + std::vector flat_inputs; for(size_t i = 0; i < schema.arguments.size(); ++i) { - Value* value = positional_inputs[i]->value; + NamedValue v = *positional_inputs[i]; const auto& arg = schema.arguments[i]; // some functions that take lists of integers for fixed size arrays // also allow single ints to be passed in their place. 
// the single int is then repeated to the length of the list - if (isIntUsedAsIntList(value, arg)) { - std::vector repeated(*arg.N, value); - value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); + if (isIntUsedAsIntList(v.value, arg)) { + std::vector repeated(*arg.N, v.value); + v.value = graph.insertNode(graph.createList(IntType::get(), repeated))->output(); } - // Allow homogeneous tuples to be casted implicitly to lists of appropriate types - if (arg.type->kind() == TypeKind::ListType && - value->type()->kind() == TypeKind::TupleType && - value->type()->isSubtypeOf(arg.type)) { - auto unpacked = createTupleUnpack(value); - auto elem_type = arg.type->expect()->getElementType(); - value = graph.insertNode(graph.createList(elem_type, unpacked))->output(); + // Allow tuples that only contain integers to turn into lists of integers + if(*ListType::ofInts() == *arg.type && + v.value->type()->kind() == TypeKind::TupleType && + v.value->type()->isSubtypeOf(ListType::ofInts())) { + auto unpacked = createTupleUnpack(v.value); + v.value = graph.insertNode(graph.createList(IntType::get(), unpacked))->output(); } - if (value->node()->kind() == prim::None){ + if (v.value->node()->kind() == prim::None){ if (arg.type->isSubtypeOf(NumberType::get())) - value = insertConstant(graph, at::Scalar(NAN), loc); + v.value = insertConstant(graph, at::Scalar(NAN), loc); else - value = graph.insertNode(graph.createUndefined())->output(); + v.value = graph.insertNode(graph.createUndefined())->output(); } - if(!value->type()->isSubtypeOf(arg.type)) { + if(!v.value->type()->isSubtypeOf(arg.type)) { err() << "expected a value of type " << arg.type->str() << " for argument '" << arg.name << "' but found " - << value->type()->str() << "\n" - << positional_inputs[i]->loc; + << v.value->type()->str() << "\n" + << v.loc; return at::nullopt; } - matched_inputs.push_back(value); + // we only support tensor lists for builtins, where they must be flattened + if(arg.type->isSubtypeOf(ListType::ofTensors())) { + auto outputs = createTupleUnpack(v.value); + flat_inputs.insert(flat_inputs.end(), outputs.begin(), outputs.end()); + } else { + flat_inputs.push_back(v.value); + } } - return matched_inputs; + return flat_inputs; } @@ -490,27 +513,27 @@ static std::shared_ptr tryEmitBuiltin( at::ArrayRef attributes) { auto graph = method.graph(); - auto matched_inputs = tryMatchSchema(op->schema(), loc, *graph, inputs, attributes, failure_messages); - if(!matched_inputs) + auto flat_inputs = tryMatchSchema(op->schema, loc, *graph, inputs, attributes, failure_messages); + if(!flat_inputs) return nullptr; // we successfully matched this schema, construct the node NodeKind kind(Symbol::aten(name)); - auto n = graph->insertNode(graph->create(kind, *matched_inputs, 0)) + auto n = graph->insertNode(graph->create(kind, *flat_inputs, 0)) ->setSourceLocation(std::make_shared(loc)); // special case for chunk when the chunks= is known // DO NOT ADD MORE SPECIAL CASES HERE, REFACTOR INTO A FUNCTION IF // NEEDED if(n->kind() == aten::chunk) { - auto value = constant_as((*matched_inputs)[1]); + auto value = constant_as((*flat_inputs)[1]); if(!value) { throw ErrorReport(loc) << "argument 'chunks' must be a constant"; } for(int64_t i = 0; i < *value; ++i) n->addOutput(); } else { - for(auto & ret : op->schema().returns) { + for(auto & ret : op->schema.returns) { n->addOutput()->setType(ret.type); } } @@ -565,7 +588,7 @@ std::shared_ptr emitBuiltinCall( } static Value* ensureTensor(const SourceRange& range, Value* v) { - 
if(!v->type()->isSubtypeOf(DynamicType::get())) { + if(!isTensorSubtype(v)) { throw ErrorReport(range) << "expected a tensor value but found a " << v->type()->str(); } @@ -677,7 +700,7 @@ struct to_ir { if (return_stmt.values().size() == 1 && results.size() == 1) { auto result = results.at(0); if(result->type()->cast()) { - results = createTupleUnpack(result).vec(); + results = createTupleUnpack(result); } } if (typed_def.schema && typed_def.schema->returns.size() != results.size()) { @@ -688,16 +711,12 @@ struct to_ir { auto range = return_stmt.range(); size_t return_type_idx = 0; for (auto& r : results) { - // TODO: support tuples and lists as returns - auto return_kind = r->type()->kind(); - if (return_kind != TypeKind::TensorType && - return_kind != TypeKind::DynamicType && - return_kind != TypeKind::IntType && - return_kind != TypeKind::FloatType) { - throw ErrorReport(return_stmt.range()) << "The only supported return types " - << "are tensors, ints and floats"; + if(r->type()->isSubtypeOf(NumberType::get())) { + graph->registerOutput(numToTensor(range, r)); + } else { + ensureTensor(range, r); + graph->registerOutput(r); } - graph->registerOutput(r); TypePtr type = DynamicType::get(); if (typed_def.schema) { type = typed_def.schema->returns.at(return_type_idx).type; @@ -1368,11 +1387,6 @@ struct to_ir { auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); return graph->insertNode(graph->createTuple(values))->output(); } break; - case TK_TUPLE_LITERAL: { - auto ll = TupleLiteral(tree); - auto values = getValues(ll.inputs(), /*maybe_unpack=*/true, identity); - return graph->insertNode(graph->createTuple(values))->output(); - } break; default: throw ErrorReport(tree) << "NYI: " << tree; break; diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index 3c4dcb07a248ee..0b87cf56be6ad3 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -68,7 +68,7 @@ struct SugaredValue : public std::enable_shared_from_this { SourceRange loc, Method & m, // note: names for args will be 'argument 0', 'argument 1', etc.. 
- at::ArrayRef inputs_, + at::ArrayRef inputs, at::ArrayRef attributes, size_t n_binders) { // n_binders is always set to the number of variables an expression is @@ -89,7 +89,7 @@ struct SugaredValue : public std::enable_shared_from_this { throw ErrorReport(loc) << "cannot call a " << kind(); } - virtual ~SugaredValue() = default; + virtual ~SugaredValue() {} }; // most things in the environment are just simple value types diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 39bb51ed89ca5d..cb7893234dc747 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -370,15 +370,10 @@ static void gatherParametersAndBuffers(std::vector & values, const } } -Stack createStack(const py::tuple& tuple, const Method& method) { - auto relevant_inputs = method.graph()->inputs().slice(0, method.num_inputs()); - return createStack(tuple, relevant_inputs); -} - py::object runMethodFromPython(Method& m, py::args args) { - auto stack = createStack(args, m); + auto stack = createStack(args); m.run(stack); - return wrapStack(std::move(stack), m.graph()->outputs()); + return wrapStack(std::move(stack)); } void initJitScriptBindings(PyObject* module) { @@ -507,8 +502,7 @@ void initJitScriptBindings(PyObject* module) { }) .def("graph_for", [](Module& self, py::args args) { if (self.find_method("forward")) { - Method & m = self.get_method("forward"); - return m.graph_for(createStack(args, m.graph()->inputs())); + return self.get_method("forward").graph_for(createStack(args)); } throw std::runtime_error("Attempted to call graph_for on a Module without a compiled forward()"); }) @@ -536,7 +530,7 @@ void initJitScriptBindings(PyObject* module) { .def("propagate_and_assign_input_and_output_shapes", &Method::propagate_and_assign_input_and_output_shapes) .def("params", &Method::params) .def("graph_for", [](Method& self, py::args args) { - return self.graph_for(createStack(args, self.graph()->inputs())); + return self.graph_for(createStack(args)); }) .def("set_arg_and_return_types", [](Method &self, TypedDef &typed_def, bool method) { std::vector arg_type_args, return_type_args; diff --git a/torch/csrc/jit/script/lexer.h b/torch/csrc/jit/script/lexer.h index 1694889d630d39..912b488dde5d9e 100644 --- a/torch/csrc/jit/script/lexer.h +++ b/torch/csrc/jit/script/lexer.h @@ -75,7 +75,6 @@ namespace script { _(TK_GATHER, "gather", "") \ _(TK_NOTHING, "nothing", "") \ _(TK_LIST_LITERAL, "list-literal", "") \ - _(TK_TUPLE_LITERAL, "tuple-literal", "") \ _(TK_FOR, "for", "for") \ _(TK_IN, "in", "in") \ _(TK_STARRED, "starred", "") \ diff --git a/torch/csrc/jit/script/parser.h b/torch/csrc/jit/script/parser.h index 0cd833dc15e488..abea2778053699 100644 --- a/torch/csrc/jit/script/parser.h +++ b/torch/csrc/jit/script/parser.h @@ -30,7 +30,7 @@ struct Parser { List(makeList(range, std::move(attributes)))); } // exp | expr, | expr, expr, ... 
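The grammar comment above ("exp | expr, | expr, expr, ...") is what parseExpOrExpList, renamed from parseExpOrExpTuple in this change, implements: parse one expression, and if a comma follows, keep collecting expressions and wrap the result in a single list literal rather than a tuple literal. A toy standalone sketch of that parse shape over a comma-separated string, with a trivial scanner in place of the script lexer (everything below is illustrative):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Toy stand-in for parseExpOrExpList: "exp" or "exp, exp, ..." up to the end
// of input. A single expression stays scalar; a trailing or separating comma
// promotes the whole thing to one list node.
struct Node {
  bool is_list = false;
  std::vector<std::string> elems;
};

Node parseExpOrExpList(std::istringstream& in) {
  Node result;
  std::string tok;
  while (std::getline(in, tok, ',')) {
    tok.erase(0, tok.find_first_not_of(' '));  // trim the toy "expression"
    result.elems.push_back(tok);
  }
  result.is_list = result.elems.size() > 1;
  return result;
}

int main() {
  std::istringstream single("a + b");
  std::istringstream many("a, b, c");
  std::cout << parseExpOrExpList(single).is_list << " "   // 0
            << parseExpOrExpList(many).is_list << "\n";   // 1
}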
- TreeRef parseExpOrExpTuple(int end) { + TreeRef parseExpOrExpList(int end) { auto prefix = parseExp(); if(L.cur().kind == ',') { std::vector exprs = { prefix }; @@ -39,7 +39,7 @@ struct Parser { exprs.push_back(parseExp()); } auto list = List::create(prefix.range(), exprs); - prefix = TupleLiteral::create(list.range(), list); + prefix = ListLiteral::create(list.range(), list); } return prefix; } @@ -61,14 +61,7 @@ struct Parser { } break; case '(': { L.next(); - if (L.nextIf(')')) { - /// here we have the empty tuple case - std::vector vecExpr; - List listExpr = List::create(L.cur().range, vecExpr); - prefix = TupleLiteral::create(L.cur().range, listExpr); - break; - } - prefix = parseExpOrExpTuple(')'); + prefix = parseExpOrExpList(')'); L.expect(')'); } break; case '[': { @@ -249,7 +242,7 @@ struct Parser { // first[,other,lhs] = rhs Assign parseAssign(List list) { auto red = parseOptionalReduction(); - auto rhs = parseExpOrExpTuple(TK_NEWLINE); + auto rhs = parseExpOrExpList(TK_NEWLINE); L.expect(TK_NEWLINE); return Assign::create(list.range(), list, AssignKind(red), Expr(rhs)); } diff --git a/torch/csrc/jit/script/python_tree_views.cpp b/torch/csrc/jit/script/python_tree_views.cpp index 7ece5e055a33df..569d1b0e66fdf3 100644 --- a/torch/csrc/jit/script/python_tree_views.cpp +++ b/torch/csrc/jit/script/python_tree_views.cpp @@ -193,10 +193,6 @@ void initTreeViewBindings(PyObject *module) { .def(py::init([](const SourceRange& range, std::vector args) { return ListLiteral::create(range, wrap_list(range, std::move(args))); })); - py::class_(m, "TupleLiteral") - .def(py::init([](const SourceRange& range, std::vector args) { - return TupleLiteral::create(range, wrap_list(range, std::move(args))); - })); py::class_(m, "Gather") .def(py::init([](const Expr& base, const Expr& index) { return Gather::create(base.range(), base, index); diff --git a/torch/csrc/jit/script/tree.h b/torch/csrc/jit/script/tree.h index 0b9bc7009e0162..e3d69d2790682d 100644 --- a/torch/csrc/jit/script/tree.h +++ b/torch/csrc/jit/script/tree.h @@ -89,7 +89,7 @@ struct Tree : std::enable_shared_from_this { throw std::runtime_error(ss.str()); } } - virtual ~Tree() = default; + virtual ~Tree() {} private: int kind_; diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index 10ac01799c0607..6cc934ab4d177a 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -58,7 +58,6 @@ namespace script { // | Gather(Expr value, Expr indices) TK_GATHER // | Var(Ident name) TK_VAR // | ListLiteral(List inputs) TK_LIST_LITERAL -// | TupleLiteral(List inputs) TK_TUPLE_LITERAL // | Starred(Expr expr) TK_STARRED // // -- NB: only allowed expressions are Const or List(Const) @@ -256,7 +255,6 @@ struct Expr : public TreeView { case TK_GATHER: case TK_VAR: case TK_LIST_LITERAL: - case TK_TUPLE_LITERAL: case '@': case TK_POW: return; @@ -696,17 +694,6 @@ struct ListLiteral : public Expr { } }; -struct TupleLiteral : public Expr { - explicit TupleLiteral(const TreeRef& tree) : Expr(tree) { - tree_->match(TK_TUPLE_LITERAL); - } - List inputs() const { - return subtree(0); - } - static TupleLiteral create(const SourceRange& range, const List& inputs) { - return TupleLiteral(Compound::create(TK_TUPLE_LITERAL, range, {inputs})); - } -}; struct Starred : public Expr { explicit Starred(const TreeRef& tree) : Expr(tree) { diff --git a/torch/csrc/jit/stack.h b/torch/csrc/jit/stack.h index 7a23aa55df538f..2c74ae7e0a4c77 100644 --- a/torch/csrc/jit/stack.h +++ b/torch/csrc/jit/stack.h @@ 
-77,8 +77,8 @@ inline void pack(Stack & stack, T&& v) { } template<> -inline void pack(Stack & stack, std::vector&& v) { - for(auto& t : v) { +inline void pack(Stack & stack, std::vector&& ts) { + for(auto& t : ts) { stack.push_back(IValue(std::move(t))); } } diff --git a/torch/csrc/jit/symbolic_variable.h b/torch/csrc/jit/symbolic_variable.h index ef6d41005789f8..e4d2f98ba0ea0f 100644 --- a/torch/csrc/jit/symbolic_variable.h +++ b/torch/csrc/jit/symbolic_variable.h @@ -119,20 +119,18 @@ struct SymbolicVariable { return create(t("narrow"), { *this, insertConstant(dim), insertConstant(start), insertConstant(length) }, 1)[0]; } static SymbolicVariable cat(ArrayRef inputs, Value* dim) { - Graph *g = dim->owningGraph(); - auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); - Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); - return create(aten::cat, {input_list, dim})[0]; + std::vector all_inputs = inputs; + all_inputs.push_back(dim); + return create(aten::cat, all_inputs)[0]; } static SymbolicVariable cat(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); return SymbolicVariable::cat(inputs, inputs[0].insertConstant(dim)); } static SymbolicVariable stack(ArrayRef inputs, Value* dim) { - Graph *g = dim->owningGraph(); - auto value_inputs = fmap(inputs, [](const SymbolicVariable & v) { return v.value(); }); - Value *input_list = g->insertNode(g->createList(DynamicType::get(), value_inputs))->output(); - return create(aten::stack, {input_list, dim})[0]; + std::vector all_inputs = inputs; + all_inputs.push_back(dim); + return create(aten::stack, all_inputs)[0]; } static SymbolicVariable stack(ArrayRef inputs, int dim) { JIT_ASSERT(inputs.size() > 0); diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp index d5d204f9465bd8..8c9763f88353e5 100644 --- a/torch/csrc/jit/test_jit.cpp +++ b/torch/csrc/jit/test_jit.cpp @@ -220,9 +220,6 @@ static void fusionTests() { testOne(1,2,0,2); - auto createFusedConcat = [](Graph & graph, at::ArrayRef inputs, int64_t dim) -> Value* { - return graph.insertNode(graph.create(prim::FusedConcat, inputs)->i_(attr::dim, dim))->output(); - }; auto testConcat = [&](int dim) { Graph graph; @@ -230,7 +227,7 @@ static void fusionTests() { Var i1 = Var::asNewInput(graph); auto o0 = i0 * i1; o0.addAsOutput(); - Var(createFusedConcat(graph, {i0, o0}, dim)).addAsOutput(); + Var::cat({i0, o0}, dim).addAsOutput(); auto a = at::rand({3,4,5}, at::kCUDA); auto b = at::rand({4,3,5}, at::kCUDA).transpose(0,1); @@ -779,9 +776,6 @@ void argumentSpecTest() { REQUIRE(!(c == a)); REQUIRE(spec.count(c) == 0); - Stack stack = { var(CF, {1,2}, true), 3, var(CF, {1,2}, true) }; - ArgumentSpec with_const(true, stack); - REQUIRE(with_const.at(2).sizes().size() == 2); } void shapeAnalysisTest() { diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp index a0e2f65e617754..aec6eb4ddc9447 100644 --- a/torch/csrc/jit/tracer.cpp +++ b/torch/csrc/jit/tracer.cpp @@ -38,9 +38,9 @@ void addInputs(Node *n, const char * name, const std::string& value) { b void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); } void addInputs(Node *n, const char * name, at::TensorList value) { - Graph *g = n->owningGraph(); - Node *list_node = g->appendNode(g->createList(DynamicType::get(), fmap(value, getValueTrace))); - n->addInput(list_node->output()); + for (auto & t : value) { + n->addInput(getValueTrace(t)); + } } void addInputs(Node *n, const char * name, 
at::IntList value) { diff --git a/torch/csrc/jit/type.cpp b/torch/csrc/jit/type.cpp index ddb4dfad0154ad..ebcc91a908c213 100644 --- a/torch/csrc/jit/type.cpp +++ b/torch/csrc/jit/type.cpp @@ -46,31 +46,31 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { return out; } -DynamicTypePtr DynamicType::get() { +TypePtr DynamicType::get() { static auto value = DynamicType::create(); return value; } -NumberTypePtr NumberType::get() { +TypePtr NumberType::get() { static auto value = NumberType::create(); return value; } -IntTypePtr IntType::get() { +TypePtr IntType::get() { static auto value = IntType::create(); return value; } -FloatTypePtr FloatType::get() { +TypePtr FloatType::get() { static auto value = FloatType::create(); return value; } -NoneTypePtr NoneType::get() { +TypePtr NoneType::get() { static auto value = NoneType::create(); return value; } -ListTypePtr ListType::ofTensors() { +TypePtr ListType::ofTensors() { static auto value = ListType::create(DynamicType::get()); return value; } -ListTypePtr ListType::ofInts() { +TypePtr ListType::ofInts() { static auto value = ListType::create(IntType::get()); return value; } diff --git a/torch/csrc/jit/type.h b/torch/csrc/jit/type.h index 713718e40681c8..7b7d708a549b32 100644 --- a/torch/csrc/jit/type.h +++ b/torch/csrc/jit/type.h @@ -80,7 +80,7 @@ struct TORCH_API Type : std::enable_shared_from_this { JIT_ASSERT(T::Kind == kind()); return std::static_pointer_cast(shared_from_this()); } - virtual ~Type() = default; + virtual ~Type() {} }; inline bool operator!=(const Type & lhs, const Type & rhs) { @@ -104,7 +104,7 @@ struct TORCH_API DynamicType : public Type { } static const TypeKind Kind = TypeKind::DynamicType; // global singleton - static DynamicTypePtr get(); + static TypePtr get(); private: DynamicType() : Type(TypeKind::DynamicType) {} @@ -186,16 +186,16 @@ struct TORCH_API TensorType : public Type { : Type(TypeKind::TensorType) , scalar_type_(tensor.type().scalarType()) , device_(tensor.type().is_cuda() ? 
tensor.get_device() : -1) - , sizes_(tensor.sizes().vec()) - , strides_(tensor.strides().vec()) {} + , sizes_(tensor.sizes()) + , strides_(tensor.strides()) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes) : TensorType(scalar_type, device, sizes, TensorType::contiguousStridesOf(sizes)) {} TensorType(at::ScalarType scalar_type, int device, at::IntList sizes, at::IntList strides) : Type(TypeKind::TensorType) , scalar_type_(scalar_type) , device_(device) - , sizes_(sizes.vec()) - , strides_(strides.vec()) + , sizes_(sizes) + , strides_(strides) {} static std::vector contiguousStridesOf(at::IntList sizes) { std::vector strides(sizes.size()); @@ -237,8 +237,8 @@ struct TORCH_API ListType : public Type { return elem; } // common cast List[Tensor] - static ListTypePtr ofTensors(); - static ListTypePtr ofInts(); + static TypePtr ofTensors(); + static TypePtr ofInts(); private: ListType(TypePtr elem) : Type(TypeKind::ListType), elem(elem) {} @@ -326,7 +326,7 @@ struct TORCH_API NumberType : public Type { } static const TypeKind Kind = TypeKind::NumberType; // global singleton - static NumberTypePtr get(); + static TypePtr get(); private: NumberType() : Type(TypeKind::NumberType) {} @@ -351,7 +351,7 @@ struct TORCH_API FloatType : public Type { } static const TypeKind Kind = TypeKind::FloatType; // global singleton - static FloatTypePtr get(); + static TypePtr get(); private: FloatType() : Type(TypeKind::FloatType) {} @@ -376,7 +376,7 @@ struct TORCH_API IntType : public Type { } static const TypeKind Kind = TypeKind::IntType; // global singleton - static IntTypePtr get(); + static TypePtr get(); private: IntType() : Type(TypeKind::IntType) {} @@ -401,7 +401,7 @@ struct NoneType : public Type { } static const TypeKind Kind = TypeKind::NoneType; // global singleton - static NoneTypePtr get(); + static TypePtr get(); private: NoneType() : Type(TypeKind::NoneType) {} diff --git a/torch/csrc/jit/variable_tensor_list.h b/torch/csrc/jit/variable_tensor_list.h index 0916fe6ac051d2..eeae2a66b17e5f 100644 --- a/torch/csrc/jit/variable_tensor_list.h +++ b/torch/csrc/jit/variable_tensor_list.h @@ -6,10 +6,10 @@ namespace torch { namespace jit { // a wrapper to mark places where we expect all the at::Tensors to be // variables struct variable_tensor_list : public std::vector { - variable_tensor_list() = default; + variable_tensor_list() {} template variable_tensor_list(InputIt first, InputIt last) - : std::vector(first, last) {} + : std::vector(first, last) {} explicit variable_tensor_list(std::vector && tensor) : std::vector(std::move(tensor)) {} }; diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index 64747c8c4b83a9..b09824ec77b4a5 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -1,33 +1,36 @@ #include "torch/csrc/onnx/init.h" +#include "torch/csrc/onnx/onnx.npb.h" #include "torch/csrc/onnx/onnx.h" -#include "onnx/onnx.pb.h" namespace torch { namespace onnx { void initONNXBindings(PyObject* module) { auto m = py::handle(module).cast(); auto onnx = m.def_submodule("_onnx"); - py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType") - .value("UNDEFINED", ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) - .value("FLOAT", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT) - .value("UINT8", ::ONNX_NAMESPACE::TensorProto_DataType_UINT8) - .value("INT8", ::ONNX_NAMESPACE::TensorProto_DataType_INT8) - .value("UINT16", ::ONNX_NAMESPACE::TensorProto_DataType_UINT16) - .value("INT16", ::ONNX_NAMESPACE::TensorProto_DataType_INT16) - 
.value("INT32", ::ONNX_NAMESPACE::TensorProto_DataType_INT32) - .value("INT64", ::ONNX_NAMESPACE::TensorProto_DataType_INT64) - .value("STRING", ::ONNX_NAMESPACE::TensorProto_DataType_STRING) - .value("BOOL", ::ONNX_NAMESPACE::TensorProto_DataType_BOOL) - .value("FLOAT16", ::ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) - .value("DOUBLE", ::ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) - .value("UINT32", ::ONNX_NAMESPACE::TensorProto_DataType_UINT32) - .value("UINT64", ::ONNX_NAMESPACE::TensorProto_DataType_UINT64) - .value("COMPLEX64", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX64) - .value("COMPLEX128", ::ONNX_NAMESPACE::TensorProto_DataType_COMPLEX128); + py::enum_(onnx, "TensorProtoDataType") + .value("UNDEFINED", onnx_TensorProto_DataType_UNDEFINED) + .value("FLOAT", onnx_TensorProto_DataType_FLOAT) + .value("UINT8", onnx_TensorProto_DataType_UINT8) + .value("INT8", onnx_TensorProto_DataType_INT8) + .value("UINT16", onnx_TensorProto_DataType_UINT16) + .value("INT16", onnx_TensorProto_DataType_INT16) + .value("INT32", onnx_TensorProto_DataType_INT32) + .value("INT64", onnx_TensorProto_DataType_INT64) + .value("STRING", onnx_TensorProto_DataType_STRING) + .value("BOOL", onnx_TensorProto_DataType_BOOL) + .value("FLOAT16", onnx_TensorProto_DataType_FLOAT16) + .value("DOUBLE", onnx_TensorProto_DataType_DOUBLE) + .value("UINT32", onnx_TensorProto_DataType_UINT32) + .value("UINT64", onnx_TensorProto_DataType_UINT64) + .value("COMPLEX64", onnx_TensorProto_DataType_COMPLEX64) + .value("COMPLEX128", onnx_TensorProto_DataType_COMPLEX128); py::enum_(onnx, "OperatorExportTypes") .value("ONNX", OperatorExportTypes::ONNX) .value("ONNX_ATEN", OperatorExportTypes::ONNX_ATEN) .value("ONNX_ATEN_FALLBACK", OperatorExportTypes::ONNX_ATEN_FALLBACK) .value("RAW", OperatorExportTypes::RAW); + + py::class_(onnx, "ModelProto") + .def("prettyPrint", &ModelProto::prettyPrint); } }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.cpp b/torch/csrc/onnx/onnx.cpp new file mode 100644 index 00000000000000..fa93f6866d5ed6 --- /dev/null +++ b/torch/csrc/onnx/onnx.cpp @@ -0,0 +1,214 @@ +#include "torch/csrc/onnx/onnx.h" + +namespace torch { namespace onnx { + +template <> +bool micropb_encode(pb_ostream_t *stream, std::string* arg) { + return pb_encode_string(stream, reinterpret_cast(arg->c_str()), arg->size()); +} +// NB: Overloads don't work so great for signed variables. Hope this doesn't +// come up! +template <> +bool micropb_encode(pb_ostream_t *stream, int64_t* arg) { + // Yes, this looks dodgy, and yes, this is what the docs say to do: + // https://jpa.kapsi.fi/nanopb/docs/reference.html#pb-encode-varint + return pb_encode_varint(stream, *reinterpret_cast(arg)); +} +template <> +bool micropb_encode(pb_ostream_t *stream, float* arg) { + return pb_encode_fixed32(stream, static_cast(arg)); +} +template <> +bool micropb_encode(pb_ostream_t *stream, double* arg) { + return pb_encode_fixed64(stream, static_cast(arg)); +} + +template <> +bool micropb_encode(pb_ostream_t *stream, Dimension* arg) { + return pb_encode_submessage(stream, onnx_TensorShapeProto_Dimension_fields, + static_cast(arg)); +} + +// TODO: I'm not entirely sure why this can't be in the header... +bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { + at::Tensor* t = static_cast(*arg); + AT_ASSERT(t->is_contiguous()); + // Packed array format! 
+ pb_encode_tag_for_field(stream, field); + pb_encode_string(stream, (pb_byte_t*)(t->data_ptr()), t->type().elementSizeInBytes()*t->numel()); + + return true; +} + +GraphProto* AttributeProto::add_graphs() { + auto ptr = new GraphProto(); + graphs.emplace_back(ptr); + return ptr; +} + +constexpr char indent_char = ' '; +constexpr size_t indent_multiplier = 2; + +std::string idt(size_t indent) { + return std::string(indent * indent_multiplier, indent_char); +} + +std::string nlidt(size_t indent) { + return std::string("\n") + idt(indent); +} + +void TensorProto::dump(std::ostream& stream, size_t indent) { + stream << "TensorProto shape: ["; + for (size_t i = 0; i < dims.size(); ++i) { + stream << *dims[i] << (i == dims.size() - 1 ? "" : " "); + } + stream << "]"; +} + +void TensorShapeProto::dump(std::ostream& stream, size_t indent) { + for (size_t i=0; i < dims.size(); ++i) { + auto &dim = dims[i]; + if (dim->has_dim_value) { + stream << dim->dim_value; + } else { + stream << "?"; + } + stream << (i == dims.size() - 1 ? "" : " "); + } +} + +void TypeProtoTensor::dump(std::ostream& stream, size_t indent) { + stream << "Tensor dims: "; + shape->dump(stream); +} + +void TypeProto::dump(std::ostream& stream, size_t indent) { + tensor_type->dump(stream); +} + +void ValueInfoProto::dump(std::ostream& stream, size_t indent) { + stream << "{name: \"" << name + << "\", type:"; + type->dump(stream); + stream << "}"; +} + +void AttributeProto::dump(std::ostream& stream, size_t indent) { + stream << "{ name: '" << name << "', type: "; + if (proto.has_f) { + stream << "float, value: " << proto.f; + } else if (proto.has_i) { + stream << "int, value: " << proto.i; + } else if (s.length()) { + stream << "string, value: '" << s << "'"; + } else if (g) { + stream << "graph, value:\n"; + g->dump(stream, indent+1); + stream << nlidt(indent); + } else if (t) { + stream << "tensor, value:"; + t->dump(stream, indent+1); + } else if (floats.size()) { + stream << "floats, values: ["; + for (size_t i=0; i < floats.size(); ++i) + stream << *floats[i] << (i == floats.size() - 1 ? "" : " "); + stream << "]"; + } else if (ints.size()) { + stream << "ints, values: ["; + for (size_t i=0; i < ints.size(); ++i) + stream << *ints[i] << (i == ints.size() - 1 ? "" : " "); + stream << "]"; + } else if (strings.size()) { + stream << "strings, values: ["; + for (size_t i=0; i < strings.size(); ++i) + stream << "'" << *strings[i] << "'" << (i == strings.size() - 1 ? "" : " "); + stream << "]"; + } else if (tensors.size()) { + stream << "tensors, values: ["; + for (auto& t : tensors) { + t->dump(stream, indent+1); + } + stream << "]"; + } else if (graphs.size()) { + stream << "graphs, values: ["; + for (auto& g : graphs) { + g->dump(stream, indent+1); + } + stream << "]"; + } else { + stream << "UNKNOWN"; + } + stream << "}"; +} + +void NodeProto::dump(std::ostream& stream, size_t indent) { + stream << "Node {type: \"" << op_type << "\", inputs: ["; + for (size_t i=0; i < inputs.size(); ++i) { + stream << *inputs[i] << (i == inputs.size() - 1 ? "" : ","); + } + stream << "], outputs: ["; + for (size_t i=0; i < outputs.size(); ++i) { + stream << *outputs[i] << (i == outputs.size() - 1 ? "" : ","); + } + stream << "], attributes: ["; + for (size_t i=0; i < attributes.size(); ++i) { + attributes[i]->dump(stream, indent+1); + stream << (i == attributes.size() - 1 ? 
"" : ","); + } + stream << "]}"; +} + +void GraphProto::dump(std::ostream& stream, size_t indent) { + stream << idt(indent) << "GraphProto {" << nlidt(indent+1) + << "name: \"" << name << "\"" << nlidt(indent+1) + << "inputs: ["; + for (size_t i=0; i < inputs.size(); ++i) { + inputs[i]->dump(stream, indent+2); + stream << (i == inputs.size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "outputs: ["; + for (size_t i=0; i < outputs.size(); ++i) { + outputs[i]->dump(stream, indent+2); + stream << (i == outputs.size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "initializers: ["; + for (size_t i=0; i < initializers.size(); ++i) { + initializers[i]->dump(stream, indent+2); + stream << (i == initializers.size() - 1 ? "" : ","); + } + stream << "]" << nlidt(indent+1) + << "nodes: [" << nlidt(indent+2); + for (size_t i=0; i < nodes.size(); ++i) { + nodes[i]->dump(stream, indent+2); + if (i != nodes.size() - 1) stream << "," << nlidt(indent+2); + } + stream << nlidt(indent+1) << "]\n" << idt(indent) << "}\n"; +} + +void OperatorSetIdProto::dump(std::ostream& stream, size_t indent) { + stream << "OperatorSetIdProto { domain: " << domain << "}"; +} + +void ModelProto::dump(std::ostream& stream, size_t indent) { + stream << idt(indent) + << "ModelProto {" << nlidt(indent+1) + << "producer_name: \"" << producer_name << "\"" << nlidt(indent+1) + << "domain: \"" << domain << "\"" << nlidt(indent+1) + << "doc_string: \"" << doc_string << "\""; + if (graph) { + stream << nlidt(indent+1) << "graph:\n"; + graph->dump(stream, indent+2); + } + if (opset_import.size()) { + stream << idt(indent+1) << "opset_import: ["; + for (auto &opset_imp : opset_import) { + opset_imp->dump(stream, indent+2); + } + stream << "],\n"; + } + stream << idt(indent) << "}\n"; +} + +}} // namespace onnx diff --git a/torch/csrc/onnx/onnx.h b/torch/csrc/onnx/onnx.h index 76170e18110f1b..7fa38cc03898e9 100644 --- a/torch/csrc/onnx/onnx.h +++ b/torch/csrc/onnx/onnx.h @@ -1,11 +1,435 @@ #pragma once +#include "torch/csrc/onnx/onnx.npb.h" +#include "torch/csrc/WindowsTorchApiMacro.h" + +#include +#include + +#include +#include +#include + namespace torch { namespace onnx { +using DataType = onnx_TensorProto_DataType; +using Dimension = onnx_TensorShapeProto_Dimension; + +// Note [Unique vector] +// ~~~~~~~~~~~~~~~~~~~~ +// Why do we need vectors of unique pointers? A Google-style C++ Protobuf API +// returns raw pointers T* which are expected to stay valid as long as the +// enclosing protobuf is live. However, if we store T directly in a vector, if +// the vector ever resizes (which it may, because we don't know a priori how +// many elements are in the vector) all of these pointers will be invalidated. +// Thus, up-front, we have to give them permanent, dynamically allocated +// addresses. 
+template +using unique_vector = std::vector>; + +// Helper function for encoding inside callbacks +template +bool micropb_encode(pb_ostream_t *stream, T* arg) { + static_assert(Field != nullptr, "no overload in micropb_encode"); + return pb_encode_submessage(stream, Field, static_cast(&arg->proto)); +} +template <> bool micropb_encode(pb_ostream_t *stream, std::string* arg); +template <> bool micropb_encode(pb_ostream_t *stream, int64_t* arg); +template <> bool micropb_encode(pb_ostream_t *stream, float* arg); +template <> bool micropb_encode(pb_ostream_t *stream, double* arg); +template <> bool micropb_encode(pb_ostream_t *stream, Dimension* arg); +// NB: If we ever add support for signed protobuf integers, we'll need a special +// wrapper, since we can't overload over them (they look the same from C++ side) + +// Callback functions of type pb_callback_t. + +// Write out a single protobuf field inside a message +template +bool micropb_callback(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { + if (!pb_encode_tag_for_field(stream, field)) return false; + if (!micropb_encode(stream, static_cast(*arg))) return false; + return true; +} + +// Write out a repeated protobuf field inside a message +template +bool micropb_callback_list(pb_ostream_t *stream, const pb_field_t *field, void * const *arg) { + std::vector>* vals = static_cast>*>(*arg); + for (std::unique_ptr& val : *vals) { + auto ptr = static_cast(val.get()); + if (!micropb_callback(stream, field, &ptr)) return false; + } + return true; +} + +bool micropb_callback_string_from_tensor(pb_ostream_t *stream, const pb_field_t *field, void * const *arg); + +// MicroProto helper class +template +struct MicroProto { + // The actual nanopb generated protobuf struct we are filling. + T proto; + + // The constructor takes the protobuf struct by value for initialization + // (since it is a C-style struct). In the constructor you're + // expected to call this with something like onnx_TensorProto_init_default + MicroProto(T proto) : proto(proto) {} + + // Usage: + // std::string owning_slot; + // proto.string_field = string(&owning_slot, value_to_set) + // + // This function takes a string 's' and copies it into the + // owning slot specified by 'slot'. It then returns a callback + // intended to be assigned into the particular protobuf field. + // The employed callback reads out the string from owning + // slot and writes it out to the protobuf. + // + // You should call this function IN THE SETTER METHOD, because + // the no-op callback is different from a callback with an empty + // string: in the former case, the field is absent; in the latter, + // the field is present but an empty string. + pb_callback_t string(std::string* slot, const std::string& s) { + *slot = s; // copy construct + pb_callback_t r; + r.funcs.encode = µpb_callback; + r.arg = static_cast(slot); + return r; // RVO + } + + // Usage: + // at::Tensor owning_slot; + // proto.string_field = string_from_tensor(&owning_slot, value_to_set) + // + // This function takes an at::Tensor and copies it into the + // owning slot specified by 'slot'. It then returns a callback + // intended to be assigned into the particular protobuf field. + // The employed callback reads out the tensor's data as if it + // were a string (adjusting for endianness, if necessary) + // writes it out to the protobuf. + // + // You should call this function IN THE SETTER METHOD, because + // the no-op callback is different from a callback with an undefined + // Tensor. 
+ pb_callback_t string_from_tensor(at::Tensor* slot, const at::Tensor& t) { + *slot = t; // copy construct + pb_callback_t r; + r.funcs.encode = µpb_callback_string_from_tensor; + r.arg = static_cast(slot); + return r; // RVO + } + + // Usage: + // unique_vector owning_slot; + // proto.list_field = list(&owning_slot) + // + // This function returns a callback intended to be + // assigned into a particular protobuf field. The employed + // callback reads out the vector of elements from the owning + // slot and writes the entries into the protobuf. + // + // You should call this function IN THE CONSTRUCTOR, because + // the no-op callback is equivalent to a callback with an empty + // list. (While it's harmless to call this in the setter, but + // a bit wasteful.) + template + pb_callback_t list(unique_vector* slot) { + pb_callback_t r; + r.funcs.encode = µpb_callback_list; + r.arg = static_cast(slot); + return r; // RVO + } + + template + pb_callback_t msg(std::unique_ptr* slot) { + *slot = std::unique_ptr(new S()); // default construct + pb_callback_t r; + r.funcs.encode = µpb_callback; + r.arg = static_cast(slot->get()); + return r; // RVO + } +}; + +#define DEFINE_CONST(C) \ +const auto k##C = onnx_TensorProto_DataType_##C; +DEFINE_CONST(FLOAT) +DEFINE_CONST(UINT8) +DEFINE_CONST(INT8) +DEFINE_CONST(UINT16) +DEFINE_CONST(INT16) +DEFINE_CONST(INT32) +DEFINE_CONST(INT64) +DEFINE_CONST(STRING) +DEFINE_CONST(BOOL) +DEFINE_CONST(FLOAT16) +DEFINE_CONST(DOUBLE) +DEFINE_CONST(UINT32) +DEFINE_CONST(UINT64) +DEFINE_CONST(COMPLEX64) +DEFINE_CONST(COMPLEX128) +#undef DEFINE_CONST + +#define DEFINE_CONST(C) \ +const auto a##C = onnx_AttributeProto_AttributeType_##C; +DEFINE_CONST(FLOAT) +DEFINE_CONST(INT) +DEFINE_CONST(STRING) +DEFINE_CONST(TENSOR) +DEFINE_CONST(GRAPH) +DEFINE_CONST(FLOATS) +DEFINE_CONST(INTS) +DEFINE_CONST(STRINGS) +DEFINE_CONST(TENSORS) +DEFINE_CONST(GRAPHS) +#undef DEFINE_CONST + +// C++ wrappers which simulate the Google C++ Protobuf API +// +// These are NOT COMPLETE wrappers. If you find something is missing, add it! + +class AttributeProto; +class TensorShapeProto; +class TypeProtoTensor; +class TensorProto; +class TypeProto; +class ValueInfoProto; +class NodeProto; +class GraphProto; +class ModelProto; + +class TensorProto : public MicroProto { +private: + std::string name; // namespace ValueInfoProto. + unique_vector dims; + at::Tensor raw_data; + std::string dump_; +public: + TensorProto() : MicroProto(onnx_TensorProto_init_default) { + proto.dims = list(&dims); + } + void set_name(const std::string& s) { proto.name = string(&name, s); } + void add_dims(int64_t d) { dims.emplace_back(new int64_t(d)); } + // Google Protobuf divergence! 
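+ // (Divergence: the Google-generated C++ API's set_raw_data() takes the bytes
+ // as a std::string, whereas here it takes the at::Tensor directly and the
+ // bytes are only read out of the tensor when the message is encoded.)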
+ void set_raw_data(const at::Tensor& t) { proto.raw_data = string_from_tensor(&raw_data, t); } + void set_external_data_present() { proto.raw_data = string(&dump_, "__EXTERNAL"); } + void set_data_type(onnx_TensorProto_DataType t) { proto.has_data_type = true; proto.data_type = t; } + std::string get_name() const { return name; } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class TensorShapeProto : public MicroProto { +private: + unique_vector dims; +public: + TensorShapeProto() : MicroProto(onnx_TensorShapeProto_init_default) { + proto.dim = list(&dims); + } + void add_dim(std::int64_t d) { + Dimension* p_d = new Dimension(); + p_d->has_dim_value = true; + p_d->dim_value = d; + dims.emplace_back(p_d); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class TypeProtoTensor : public MicroProto { +private: + std::unique_ptr shape; +public: + TypeProtoTensor() : MicroProto(onnx_TypeProto_Tensor_init_default) {} + void set_data_type(onnx_TensorProto_DataType t) { proto.has_elem_type = true; proto.elem_type = t; } + TensorShapeProto* mutable_shape() { + proto.shape = msg(&shape); + return shape.get(); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class TypeProto : public MicroProto { +private: + std::unique_ptr tensor_type; +public: + TypeProto() : MicroProto(onnx_TypeProto_init_default) {} + TypeProtoTensor* mutable_tensor_type() { + proto.tensor_type = msg(&tensor_type); + return tensor_type.get(); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class ValueInfoProto : public MicroProto { +private: + std::string name; + std::unique_ptr type; +public: + ValueInfoProto() : MicroProto(onnx_ValueInfoProto_init_default) {} + std::string get_name() { return name; } + void set_name(const std::string& s) { proto.name = string(&name, s); } + TypeProto* mutable_type() { + proto.type = msg(&type); + return type.get(); + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class AttributeProto : public MicroProto { +private: + std::string name; + std::string s; + std::unique_ptr g; + std::unique_ptr t; + unique_vector floats; + unique_vector ints; + unique_vector strings; + unique_vector tensors; + unique_vector graphs; +public: + AttributeProto() : MicroProto(onnx_AttributeProto_init_default) { + proto.floats = list(&floats); + proto.ints = list(&ints); + proto.strings = list(&strings); + proto.tensors = list(&tensors); + proto.graphs = list(&graphs); + } + void set_name(const std::string& s) { proto.name = string(&name, s); } + void set_type(onnx_AttributeProto_AttributeType t) { proto.has_type = true; proto.type = t; } + void set_f(float f) { proto.has_f = true; proto.f = f; } + void set_i(int64_t i) { proto.has_i = true; proto.i = i; } + void set_s(std::string s_) { proto.s = string(&s, s_); } + // See https://developers.google.com/protocol-buffers/docs/reference/cpp-generated#embeddedmessage + GraphProto* mutable_g() { proto.g = msg(&g); return g.get(); } + TensorProto* mutable_t() { proto.t = msg(&t); return t.get(); } + void add_floats(float f) { floats.emplace_back(new float(f)); } + void add_ints(int64_t i) { ints.emplace_back(new int64_t(i)); } + void add_strings(std::string s) { strings.emplace_back(new std::string(s)); } + TensorProto* add_tensors() { + auto ptr = new TensorProto(); + tensors.emplace_back(ptr); + return ptr; + } + GraphProto* add_graphs(); + void dump(std::ostream& stream, size_t indent = 0); +}; + +class NodeProto : public MicroProto { +private: + std::string op_type; + std::string domain; + 
std::string doc_string; + unique_vector inputs; + unique_vector outputs; + unique_vector attributes; +public: + NodeProto() : MicroProto(onnx_NodeProto_init_default) { + proto.input = list(&inputs); + proto.output = list(&outputs); + proto.attribute = list(&attributes); + } + void add_input(const std::string& s) { inputs.emplace_back(new std::string(s)); } + void clear_input() { inputs.clear(); } + void add_output(const std::string& s) { outputs.emplace_back(new std::string(s)); } + void clear_output() { outputs.clear(); } + AttributeProto* add_attribute() { + auto ptr = new AttributeProto(); + attributes.emplace_back(ptr); + return ptr; + } + void set_op_type(const std::string& s) { proto.op_type = string(&op_type, s); } + void set_domain(const std::string& s) { proto.domain = string(&domain, s); } + void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class GraphProto : public MicroProto { +private: + std::string name; + unique_vector inputs; + unique_vector outputs; + unique_vector nodes; + unique_vector initializers; +public: + GraphProto() : MicroProto(onnx_GraphProto_init_default) { + proto.input = list(&inputs); + proto.output = list(&outputs); + proto.node = list(&nodes); + proto.initializer = list(&initializers); + } + void set_name(const std::string& s) { proto.name = string(&name, s); } + ValueInfoProto* add_input() { + auto ptr = new ValueInfoProto(); + inputs.emplace_back(ptr); + return ptr; + } + std::string get_input_name(size_t i) { return inputs.at(i)->get_name(); } + ValueInfoProto* add_output() { + auto ptr = new ValueInfoProto(); + outputs.emplace_back(ptr); + return ptr; + } + NodeProto* add_node() { + auto ptr = new NodeProto(); + nodes.emplace_back(ptr); + return ptr; + } + TensorProto* add_initializer() { + auto ptr = new TensorProto(); + initializers.emplace_back(ptr); + return ptr; + } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class OperatorSetIdProto : public MicroProto { +private: + std::string domain; +public: + OperatorSetIdProto() : MicroProto(onnx_OperatorSetIdProto_init_default) {} + void set_domain(const std::string& s) { proto.domain = string(&domain, s); } + void set_version(int64_t v) { proto.has_version = true; proto.version = v; } + void dump(std::ostream& stream, size_t indent = 0); +}; + +class ModelProto : public MicroProto { +private: + std::string producer_name; + std::string producer_version; + std::string domain; + std::string doc_string; + std::unique_ptr graph; + unique_vector opset_import; +public: + ModelProto() : MicroProto(onnx_ModelProto_init_default) { + proto.has_ir_version = true; + proto.ir_version = onnx_Version_IR_VERSION; + proto.opset_import = list(&opset_import); + } + void set_model_version(int64_t i) { proto.has_model_version = true; proto.model_version = i; } + void set_doc_string(const std::string& s) { proto.doc_string = string(&doc_string, s); } + void set_producer_name(const std::string& s) { proto.producer_name = string(&producer_name, s); } + void set_producer_version(const std::string& s) { proto.producer_version = string(&producer_version, s); } + GraphProto* mutable_graph() { + proto.graph = msg(&graph); + return graph.get(); + } + OperatorSetIdProto* add_opset_import() { + auto ptr = new OperatorSetIdProto(); + opset_import.emplace_back(ptr); + return ptr; + } + TORCH_API void dump(std::ostream& stream, size_t indent = 0); + std::string prettyPrint() { + std::stringstream ss; + dump(ss, 0); + return 
ss.str(); + } +}; + enum class OperatorExportTypes { ONNX, // Strict ONNX export ONNX_ATEN, // ONNX With ATen op everywhere ONNX_ATEN_FALLBACK, // ONNX export with ATen fallback RAW, // Raw export (no ONNX) }; + }} // namespace torch::onnx diff --git a/torch/csrc/onnx/onnx.npb.cpp b/torch/csrc/onnx/onnx.npb.cpp new file mode 100644 index 00000000000000..2d8ee60eaff414 --- /dev/null +++ b/torch/csrc/onnx/onnx.npb.cpp @@ -0,0 +1,162 @@ +/* Automatically generated nanopb constant definitions */ +/* Generated by nanopb-0.3.9-dev */ + +#include "onnx.npb.h" + +/* @@protoc_insertion_point(includes) */ +#if PB_PROTO_HEADER_VERSION != 30 +#error Regenerate this file with the current version of nanopb generator. +#endif + + + +const pb_field_t onnx_AttributeProto_fields[14] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_AttributeProto, name, name, 0), + PB_FIELD( 2, FLOAT , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, f, name, 0), + PB_FIELD( 3, INT64 , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, i, f, 0), + PB_FIELD( 4, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, s, i, 0), + PB_FIELD( 5, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, t, s, &onnx_TensorProto_fields), + PB_FIELD( 6, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, g, t, &onnx_GraphProto_fields), + PB_FIELD( 7, FLOAT , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, floats, g, 0), + PB_FIELD( 8, INT64 , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, ints, floats, 0), + PB_FIELD( 9, BYTES , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, strings, ints, 0), + PB_FIELD( 10, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, tensors, strings, &onnx_TensorProto_fields), + PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_AttributeProto, graphs, tensors, &onnx_GraphProto_fields), + PB_FIELD( 13, STRING , OPTIONAL, CALLBACK, OTHER, onnx_AttributeProto, doc_string, graphs, 0), + PB_FIELD( 20, UENUM , OPTIONAL, STATIC , OTHER, onnx_AttributeProto, type, doc_string, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_ValueInfoProto_fields[4] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_ValueInfoProto, name, name, 0), + PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, type, name, &onnx_TypeProto_fields), + PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ValueInfoProto, doc_string, type, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_NodeProto_fields[8] = { + PB_FIELD( 1, STRING , REPEATED, CALLBACK, FIRST, onnx_NodeProto, input, input, 0), + PB_FIELD( 2, STRING , REPEATED, CALLBACK, OTHER, onnx_NodeProto, output, input, 0), + PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, name, output, 0), + PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, op_type, name, 0), + PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_NodeProto, attribute, op_type, &onnx_AttributeProto_fields), + PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, doc_string, attribute, 0), + PB_FIELD( 7, STRING , OPTIONAL, CALLBACK, OTHER, onnx_NodeProto, domain, doc_string, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_ModelProto_fields[10] = { + PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_ModelProto, ir_version, ir_version, 0), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_name, ir_version, 0), + PB_FIELD( 3, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, producer_version, producer_name, 0), + PB_FIELD( 4, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, domain, 
producer_version, 0), + PB_FIELD( 5, INT64 , OPTIONAL, STATIC , OTHER, onnx_ModelProto, model_version, domain, 0), + PB_FIELD( 6, STRING , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, doc_string, model_version, 0), + PB_FIELD( 7, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_ModelProto, graph, doc_string, &onnx_GraphProto_fields), + PB_FIELD( 8, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, opset_import, graph, &onnx_OperatorSetIdProto_fields), + PB_FIELD( 14, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_ModelProto, metadata_props, opset_import, &onnx_StringStringEntryProto_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_StringStringEntryProto_fields[3] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_StringStringEntryProto, key, key, 0), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_StringStringEntryProto, value, key, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_GraphProto_fields[8] = { + PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_GraphProto, node, node, &onnx_NodeProto_fields), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, name, node, 0), + PB_FIELD( 5, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, initializer, name, &onnx_TensorProto_fields), + PB_FIELD( 10, STRING , OPTIONAL, CALLBACK, OTHER, onnx_GraphProto, doc_string, initializer, 0), + PB_FIELD( 11, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, input, doc_string, &onnx_ValueInfoProto_fields), + PB_FIELD( 12, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, output, input, &onnx_ValueInfoProto_fields), + PB_FIELD( 13, MESSAGE , REPEATED, CALLBACK, OTHER, onnx_GraphProto, value_info, output, &onnx_ValueInfoProto_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorProto_fields[13] = { + PB_FIELD( 1, INT64 , REPEATED, CALLBACK, FIRST, onnx_TensorProto, dims, dims, 0), + PB_FIELD( 2, UENUM , OPTIONAL, STATIC , OTHER, onnx_TensorProto, data_type, dims, 0), + PB_FIELD( 3, MESSAGE , OPTIONAL, STATIC , OTHER, onnx_TensorProto, segment, data_type, &onnx_TensorProto_Segment_fields), + PB_FIELD( 4, FLOAT , REPEATED, CALLBACK, OTHER, onnx_TensorProto, float_data, segment, 0), + PB_FIELD( 5, INT32 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int32_data, float_data, 0), + PB_FIELD( 6, BYTES , REPEATED, CALLBACK, OTHER, onnx_TensorProto, string_data, int32_data, 0), + PB_FIELD( 7, INT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, int64_data, string_data, 0), + PB_FIELD( 8, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, name, int64_data, 0), + PB_FIELD( 9, BYTES , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, raw_data, name, 0), + PB_FIELD( 10, DOUBLE , REPEATED, CALLBACK, OTHER, onnx_TensorProto, double_data, raw_data, 0), + PB_FIELD( 11, UINT64 , REPEATED, CALLBACK, OTHER, onnx_TensorProto, uint64_data, double_data, 0), + PB_FIELD( 12, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorProto, doc_string, uint64_data, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorProto_Segment_fields[3] = { + PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorProto_Segment, begin, begin, 0), + PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_TensorProto_Segment, end, begin, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorShapeProto_fields[2] = { + PB_FIELD( 1, MESSAGE , REPEATED, CALLBACK, FIRST, onnx_TensorShapeProto, dim, dim, &onnx_TensorShapeProto_Dimension_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TensorShapeProto_Dimension_fields[3] = { + PB_FIELD( 1, INT64 , OPTIONAL, STATIC , FIRST, onnx_TensorShapeProto_Dimension, 
dim_value, dim_value, 0), + PB_FIELD( 2, STRING , OPTIONAL, CALLBACK, OTHER, onnx_TensorShapeProto_Dimension, dim_param, dim_value, 0), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TypeProto_fields[2] = { + PB_FIELD( 1, MESSAGE , OPTIONAL, CALLBACK, FIRST, onnx_TypeProto, tensor_type, tensor_type, &onnx_TypeProto_Tensor_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_TypeProto_Tensor_fields[3] = { + PB_FIELD( 1, UENUM , OPTIONAL, STATIC , FIRST, onnx_TypeProto_Tensor, elem_type, elem_type, 0), + PB_FIELD( 2, MESSAGE , OPTIONAL, CALLBACK, OTHER, onnx_TypeProto_Tensor, shape, elem_type, &onnx_TensorShapeProto_fields), + PB_LAST_FIELD +}; + +const pb_field_t onnx_OperatorSetIdProto_fields[3] = { + PB_FIELD( 1, STRING , OPTIONAL, CALLBACK, FIRST, onnx_OperatorSetIdProto, domain, domain, 0), + PB_FIELD( 2, INT64 , OPTIONAL, STATIC , OTHER, onnx_OperatorSetIdProto, version, domain, 0), + PB_LAST_FIELD +}; + + + + + +/* Check that field information fits in pb_field_t */ +#if !defined(PB_FIELD_32BIT) +/* If you get an error here, it means that you need to define PB_FIELD_32BIT + * compile-time option. You can do that in pb.h or on compiler command line. + * + * The reason you need to do this is that some of your messages contain tag + * numbers or field sizes that are larger than what can fit in 8 or 16 bit + * field descriptors. + */ +PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 65536), YOU_MUST_DEFINE_PB_FIELD_32BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) +#endif + +#if !defined(PB_FIELD_16BIT) && !defined(PB_FIELD_32BIT) +/* If you get an error here, it means that you need to define PB_FIELD_16BIT + * compile-time option. You can do that in pb.h or on compiler command line. + * + * The reason you need to do this is that some of your messages contain tag + * numbers or field sizes that are larger than what can fit in the default + * 8 bit descriptors. + */ +PB_STATIC_ASSERT((pb_membersize(onnx_TensorProto, segment) < 256), YOU_MUST_DEFINE_PB_FIELD_16BIT_FOR_MESSAGES_onnx_AttributeProto_onnx_ValueInfoProto_onnx_NodeProto_onnx_ModelProto_onnx_StringStringEntryProto_onnx_GraphProto_onnx_TensorProto_onnx_TensorProto_Segment_onnx_TensorShapeProto_onnx_TensorShapeProto_Dimension_onnx_TypeProto_onnx_TypeProto_Tensor_onnx_OperatorSetIdProto) +#endif + + +/* On some platforms (such as AVR), double is really float. + * These are not directly supported by nanopb, but see example_avr_double. + * To get rid of this error, remove any double fields from your .proto. + */ +PB_STATIC_ASSERT(sizeof(double) == 8, DOUBLE_MUST_BE_8_BYTES) + +/* @@protoc_insertion_point(eof) */ diff --git a/torch/csrc/onnx/onnx.npb.h b/torch/csrc/onnx/onnx.npb.h new file mode 100644 index 00000000000000..84d3b318643830 --- /dev/null +++ b/torch/csrc/onnx/onnx.npb.h @@ -0,0 +1,333 @@ +/* Automatically generated nanopb header */ +/* Generated by nanopb-0.3.9-dev */ + +#ifndef PB_ONNX_ONNX_PB_H_INCLUDED +#define PB_ONNX_ONNX_PB_H_INCLUDED +#include + +/* @@protoc_insertion_point(includes) */ +#if PB_PROTO_HEADER_VERSION != 30 +#error Regenerate this file with the current version of nanopb generator. 
+#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Enum definitions */ +typedef enum _onnx_Version { + onnx_Version__START_VERSION = 0, + onnx_Version_IR_VERSION_2017_10_10 = 1, + onnx_Version_IR_VERSION_2017_10_30 = 2, + onnx_Version_IR_VERSION = 3 +} onnx_Version; +#define _onnx_Version_MIN onnx_Version__START_VERSION +#define _onnx_Version_MAX onnx_Version_IR_VERSION +#define _onnx_Version_ARRAYSIZE ((onnx_Version)(onnx_Version_IR_VERSION+1)) + +typedef enum _onnx_AttributeProto_AttributeType { + onnx_AttributeProto_AttributeType_UNDEFINED = 0, + onnx_AttributeProto_AttributeType_FLOAT = 1, + onnx_AttributeProto_AttributeType_INT = 2, + onnx_AttributeProto_AttributeType_STRING = 3, + onnx_AttributeProto_AttributeType_TENSOR = 4, + onnx_AttributeProto_AttributeType_GRAPH = 5, + onnx_AttributeProto_AttributeType_FLOATS = 6, + onnx_AttributeProto_AttributeType_INTS = 7, + onnx_AttributeProto_AttributeType_STRINGS = 8, + onnx_AttributeProto_AttributeType_TENSORS = 9, + onnx_AttributeProto_AttributeType_GRAPHS = 10 +} onnx_AttributeProto_AttributeType; +#define _onnx_AttributeProto_AttributeType_MIN onnx_AttributeProto_AttributeType_UNDEFINED +#define _onnx_AttributeProto_AttributeType_MAX onnx_AttributeProto_AttributeType_GRAPHS +#define _onnx_AttributeProto_AttributeType_ARRAYSIZE ((onnx_AttributeProto_AttributeType)(onnx_AttributeProto_AttributeType_GRAPHS+1)) + +typedef enum _onnx_TensorProto_DataType { + onnx_TensorProto_DataType_UNDEFINED = 0, + onnx_TensorProto_DataType_FLOAT = 1, + onnx_TensorProto_DataType_UINT8 = 2, + onnx_TensorProto_DataType_INT8 = 3, + onnx_TensorProto_DataType_UINT16 = 4, + onnx_TensorProto_DataType_INT16 = 5, + onnx_TensorProto_DataType_INT32 = 6, + onnx_TensorProto_DataType_INT64 = 7, + onnx_TensorProto_DataType_STRING = 8, + onnx_TensorProto_DataType_BOOL = 9, + onnx_TensorProto_DataType_FLOAT16 = 10, + onnx_TensorProto_DataType_DOUBLE = 11, + onnx_TensorProto_DataType_UINT32 = 12, + onnx_TensorProto_DataType_UINT64 = 13, + onnx_TensorProto_DataType_COMPLEX64 = 14, + onnx_TensorProto_DataType_COMPLEX128 = 15 +} onnx_TensorProto_DataType; +#define _onnx_TensorProto_DataType_MIN onnx_TensorProto_DataType_UNDEFINED +#define _onnx_TensorProto_DataType_MAX onnx_TensorProto_DataType_COMPLEX128 +#define _onnx_TensorProto_DataType_ARRAYSIZE ((onnx_TensorProto_DataType)(onnx_TensorProto_DataType_COMPLEX128+1)) + +/* Struct definitions */ +typedef struct _onnx_GraphProto { + pb_callback_t node; + pb_callback_t name; + pb_callback_t initializer; + pb_callback_t doc_string; + pb_callback_t input; + pb_callback_t output; + pb_callback_t value_info; +/* @@protoc_insertion_point(struct:onnx_GraphProto) */ +} onnx_GraphProto; + +typedef struct _onnx_NodeProto { + pb_callback_t input; + pb_callback_t output; + pb_callback_t name; + pb_callback_t op_type; + pb_callback_t attribute; + pb_callback_t doc_string; + pb_callback_t domain; +/* @@protoc_insertion_point(struct:onnx_NodeProto) */ +} onnx_NodeProto; + +typedef struct _onnx_StringStringEntryProto { + pb_callback_t key; + pb_callback_t value; +/* @@protoc_insertion_point(struct:onnx_StringStringEntryProto) */ +} onnx_StringStringEntryProto; + +typedef struct _onnx_TensorShapeProto { + pb_callback_t dim; +/* @@protoc_insertion_point(struct:onnx_TensorShapeProto) */ +} onnx_TensorShapeProto; + +typedef struct _onnx_TypeProto { + pb_callback_t tensor_type; +/* @@protoc_insertion_point(struct:onnx_TypeProto) */ +} onnx_TypeProto; + +typedef struct _onnx_ValueInfoProto { + pb_callback_t name; + pb_callback_t type; + 
pb_callback_t doc_string; +/* @@protoc_insertion_point(struct:onnx_ValueInfoProto) */ +} onnx_ValueInfoProto; + +typedef struct _onnx_AttributeProto { + pb_callback_t name; + bool has_f; + float f; + bool has_i; + int64_t i; + pb_callback_t s; + pb_callback_t t; + pb_callback_t g; + pb_callback_t floats; + pb_callback_t ints; + pb_callback_t strings; + pb_callback_t tensors; + pb_callback_t graphs; + pb_callback_t doc_string; + bool has_type; + onnx_AttributeProto_AttributeType type; +/* @@protoc_insertion_point(struct:onnx_AttributeProto) */ +} onnx_AttributeProto; + +typedef struct _onnx_ModelProto { + bool has_ir_version; + int64_t ir_version; + pb_callback_t producer_name; + pb_callback_t producer_version; + pb_callback_t domain; + bool has_model_version; + int64_t model_version; + pb_callback_t doc_string; + pb_callback_t graph; + pb_callback_t opset_import; + pb_callback_t metadata_props; +/* @@protoc_insertion_point(struct:onnx_ModelProto) */ +} onnx_ModelProto; + +typedef struct _onnx_OperatorSetIdProto { + pb_callback_t domain; + bool has_version; + int64_t version; +/* @@protoc_insertion_point(struct:onnx_OperatorSetIdProto) */ +} onnx_OperatorSetIdProto; + +typedef struct _onnx_TensorProto_Segment { + bool has_begin; + int64_t begin; + bool has_end; + int64_t end; +/* @@protoc_insertion_point(struct:onnx_TensorProto_Segment) */ +} onnx_TensorProto_Segment; + +typedef struct _onnx_TensorShapeProto_Dimension { + bool has_dim_value; + int64_t dim_value; + pb_callback_t dim_param; +/* @@protoc_insertion_point(struct:onnx_TensorShapeProto_Dimension) */ +} onnx_TensorShapeProto_Dimension; + +typedef struct _onnx_TypeProto_Tensor { + bool has_elem_type; + onnx_TensorProto_DataType elem_type; + pb_callback_t shape; +/* @@protoc_insertion_point(struct:onnx_TypeProto_Tensor) */ +} onnx_TypeProto_Tensor; + +typedef struct _onnx_TensorProto { + pb_callback_t dims; + bool has_data_type; + onnx_TensorProto_DataType data_type; + bool has_segment; + onnx_TensorProto_Segment segment; + pb_callback_t float_data; + pb_callback_t int32_data; + pb_callback_t string_data; + pb_callback_t int64_data; + pb_callback_t name; + pb_callback_t raw_data; + pb_callback_t double_data; + pb_callback_t uint64_data; + pb_callback_t doc_string; +/* @@protoc_insertion_point(struct:onnx_TensorProto) */ +} onnx_TensorProto; + +/* Default values for struct fields */ + +/* Initializer values for message structs */ +#define onnx_AttributeProto_init_default {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} +#define onnx_ValueInfoProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_NodeProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_ModelProto_init_default {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_StringStringEntryProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_GraphProto_init_default {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_init_default {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_default, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, 
NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_Segment_init_default {false, 0, false, 0} +#define onnx_TensorShapeProto_init_default {{{NULL}, NULL}} +#define onnx_TensorShapeProto_Dimension_init_default {false, 0, {{NULL}, NULL}} +#define onnx_TypeProto_init_default {{{NULL}, NULL}} +#define onnx_TypeProto_Tensor_init_default {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} +#define onnx_OperatorSetIdProto_init_default {{{NULL}, NULL}, false, 0} +#define onnx_AttributeProto_init_zero {{{NULL}, NULL}, false, 0, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, (onnx_AttributeProto_AttributeType)0} +#define onnx_ValueInfoProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_NodeProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_ModelProto_init_zero {false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, false, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_StringStringEntryProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_GraphProto_init_zero {{{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_init_zero {{{NULL}, NULL}, false, (onnx_TensorProto_DataType)0, false, onnx_TensorProto_Segment_init_zero, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}} +#define onnx_TensorProto_Segment_init_zero {false, 0, false, 0} +#define onnx_TensorShapeProto_init_zero {{{NULL}, NULL}} +#define onnx_TensorShapeProto_Dimension_init_zero {false, 0, {{NULL}, NULL}} +#define onnx_TypeProto_init_zero {{{NULL}, NULL}} +#define onnx_TypeProto_Tensor_init_zero {false, (onnx_TensorProto_DataType)0, {{NULL}, NULL}} +#define onnx_OperatorSetIdProto_init_zero {{{NULL}, NULL}, false, 0} + +/* Field tags (for use in manual encoding/decoding) */ +#define onnx_GraphProto_node_tag 1 +#define onnx_GraphProto_name_tag 2 +#define onnx_GraphProto_initializer_tag 5 +#define onnx_GraphProto_doc_string_tag 10 +#define onnx_GraphProto_input_tag 11 +#define onnx_GraphProto_output_tag 12 +#define onnx_GraphProto_value_info_tag 13 +#define onnx_NodeProto_input_tag 1 +#define onnx_NodeProto_output_tag 2 +#define onnx_NodeProto_name_tag 3 +#define onnx_NodeProto_op_type_tag 4 +#define onnx_NodeProto_domain_tag 7 +#define onnx_NodeProto_attribute_tag 5 +#define onnx_NodeProto_doc_string_tag 6 +#define onnx_StringStringEntryProto_key_tag 1 +#define onnx_StringStringEntryProto_value_tag 2 +#define onnx_TensorShapeProto_dim_tag 1 +#define onnx_TypeProto_tensor_type_tag 1 +#define onnx_ValueInfoProto_name_tag 1 +#define onnx_ValueInfoProto_type_tag 2 +#define onnx_ValueInfoProto_doc_string_tag 3 +#define onnx_AttributeProto_name_tag 1 +#define onnx_AttributeProto_doc_string_tag 13 +#define onnx_AttributeProto_type_tag 20 +#define onnx_AttributeProto_f_tag 2 +#define onnx_AttributeProto_i_tag 3 +#define onnx_AttributeProto_s_tag 4 +#define onnx_AttributeProto_t_tag 5 +#define onnx_AttributeProto_g_tag 6 +#define onnx_AttributeProto_floats_tag 7 +#define onnx_AttributeProto_ints_tag 8 +#define onnx_AttributeProto_strings_tag 9 +#define onnx_AttributeProto_tensors_tag 10 +#define onnx_AttributeProto_graphs_tag 11 +#define 
onnx_ModelProto_ir_version_tag 1 +#define onnx_ModelProto_opset_import_tag 8 +#define onnx_ModelProto_producer_name_tag 2 +#define onnx_ModelProto_producer_version_tag 3 +#define onnx_ModelProto_domain_tag 4 +#define onnx_ModelProto_model_version_tag 5 +#define onnx_ModelProto_doc_string_tag 6 +#define onnx_ModelProto_graph_tag 7 +#define onnx_ModelProto_metadata_props_tag 14 +#define onnx_OperatorSetIdProto_domain_tag 1 +#define onnx_OperatorSetIdProto_version_tag 2 +#define onnx_TensorProto_Segment_begin_tag 1 +#define onnx_TensorProto_Segment_end_tag 2 +#define onnx_TensorShapeProto_Dimension_dim_value_tag 1 +#define onnx_TensorShapeProto_Dimension_dim_param_tag 2 +#define onnx_TypeProto_Tensor_elem_type_tag 1 +#define onnx_TypeProto_Tensor_shape_tag 2 +#define onnx_TensorProto_dims_tag 1 +#define onnx_TensorProto_data_type_tag 2 +#define onnx_TensorProto_segment_tag 3 +#define onnx_TensorProto_float_data_tag 4 +#define onnx_TensorProto_int32_data_tag 5 +#define onnx_TensorProto_string_data_tag 6 +#define onnx_TensorProto_int64_data_tag 7 +#define onnx_TensorProto_name_tag 8 +#define onnx_TensorProto_doc_string_tag 12 +#define onnx_TensorProto_raw_data_tag 9 +#define onnx_TensorProto_double_data_tag 10 +#define onnx_TensorProto_uint64_data_tag 11 + +/* Struct field encoding specification for nanopb */ +extern const pb_field_t onnx_AttributeProto_fields[14]; +extern const pb_field_t onnx_ValueInfoProto_fields[4]; +extern const pb_field_t onnx_NodeProto_fields[8]; +extern const pb_field_t onnx_ModelProto_fields[10]; +extern const pb_field_t onnx_StringStringEntryProto_fields[3]; +extern const pb_field_t onnx_GraphProto_fields[8]; +extern const pb_field_t onnx_TensorProto_fields[13]; +extern const pb_field_t onnx_TensorProto_Segment_fields[3]; +extern const pb_field_t onnx_TensorShapeProto_fields[2]; +extern const pb_field_t onnx_TensorShapeProto_Dimension_fields[3]; +extern const pb_field_t onnx_TypeProto_fields[2]; +extern const pb_field_t onnx_TypeProto_Tensor_fields[3]; +extern const pb_field_t onnx_OperatorSetIdProto_fields[3]; + +/* Maximum encoded size of messages (where known) */ +/* onnx_AttributeProto_size depends on runtime parameters */ +/* onnx_ValueInfoProto_size depends on runtime parameters */ +/* onnx_NodeProto_size depends on runtime parameters */ +/* onnx_ModelProto_size depends on runtime parameters */ +/* onnx_StringStringEntryProto_size depends on runtime parameters */ +/* onnx_GraphProto_size depends on runtime parameters */ +/* onnx_TensorProto_size depends on runtime parameters */ +#define onnx_TensorProto_Segment_size 22 +/* onnx_TensorShapeProto_size depends on runtime parameters */ +/* onnx_TensorShapeProto_Dimension_size depends on runtime parameters */ +/* onnx_TypeProto_size depends on runtime parameters */ +/* onnx_TypeProto_Tensor_size depends on runtime parameters */ +/* onnx_OperatorSetIdProto_size depends on runtime parameters */ + +/* Message IDs (where set with "msgid" option) */ +#ifdef PB_MSGID + +#define ONNX_MESSAGES \ + + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif +/* @@protoc_insertion_point(eof) */ + +#endif diff --git a/torch/csrc/onnx/onnx.options b/torch/csrc/onnx/onnx.options new file mode 100644 index 00000000000000..dd02d208eb7698 --- /dev/null +++ b/torch/csrc/onnx/onnx.options @@ -0,0 +1,24 @@ +# Note [Callback for nested messages] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# nanopb's default translation for a nested, non-repeated (possibly +# optional) message is to include it *inline* (no indirection), with +# a boolean 
has_g/has_t field to indicate its presence or not. Why +# do we not like this? It's not compatible with our ownership model, +# where a TensorProto/GraphProto class owns the protobuf struct it +# is constructing. With the default translation, the protobuf struct +# occurs in two places: a TensorProto, AND the parent protobuf struct +# field. That's bad. Turning it back into a callback solves the +# ownership problem. +# +# Two more bonuses: at the cost of an indirection, we no longer waste fields +# when we aren't actually storing a graph/tensor; furthermore, circular +# dependencies now work! + +onnx.AttributeProto.g type:FT_CALLBACK +onnx.AttributeProto.t type:FT_CALLBACK +onnx.ModelProto.graph type:FT_CALLBACK +onnx.TypeProto.Tensor.shape type:FT_CALLBACK +onnx.TypeProto.tensor_type type:FT_CALLBACK +onnx.ValueInfoProto.type type:FT_CALLBACK +onnx.TypeProto no_unions:true +onnx.TensorShapeProto.Dimension no_unions:true diff --git a/torch/csrc/utils/hash.h b/torch/csrc/utils/hash.h index 954a7b5b7d0814..05a5a27b51223a 100644 --- a/torch/csrc/utils/hash.h +++ b/torch/csrc/utils/hash.h @@ -32,7 +32,7 @@ namespace torch { // DEALINGS IN THE SOFTWARE. inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); + return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp index 0160bdd2d8e506..f8d5fd1ba1cd63 100644 --- a/torch/csrc/utils/invalid_arguments.cpp +++ b/torch/csrc/utils/invalid_arguments.cpp @@ -16,7 +16,7 @@ std::string py_typename(PyObject *object) { struct Type { virtual bool is_matching(PyObject *object) = 0; - virtual ~Type() = default; + virtual ~Type() {}; }; struct SimpleType: public Type { diff --git a/torch/csrc/utils/invalid_arguments.h b/torch/csrc/utils/invalid_arguments.h index daaccfd877f377..138c3331113b7c 100644 --- a/torch/csrc/utils/invalid_arguments.h +++ b/torch/csrc/utils/invalid_arguments.h @@ -7,9 +7,7 @@ namespace torch { std::string format_invalid_args( - PyObject* given_args, - PyObject* given_kwargs, - const std::string& function_name, + PyObject *args, PyObject *kwargs, const std::string& name, const std::vector& options); } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 0f2f51904c2554..b00bd27c087495 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -90,8 +90,8 @@ struct PythonArgParser { private: [[noreturn]] - void print_error(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); - PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* parsed_args[]); + void print_error(PyObject* args, PyObject* kwargs, PyObject* dst[]); + PythonArgs raw_parse(PyObject* args, PyObject* kwargs, PyObject* dst[]); std::vector signatures_; std::string function_name; diff --git a/torch/csrc/utils/tensor_apply.h b/torch/csrc/utils/tensor_apply.h index 5dfdef98c81db4..47fbaa672c4262 100644 --- a/torch/csrc/utils/tensor_apply.h +++ b/torch/csrc/utils/tensor_apply.h @@ -6,8 +6,8 @@ namespace torch { namespace utils { at::Tensor & apply_(at::Tensor & self, PyObject* fn); -at::Tensor & map_(at::Tensor & self, const at::Tensor & other_, PyObject* fn); -at::Tensor & map2_(at::Tensor & self, const at::Tensor & x_, - const at::Tensor & y_, PyObject* fn); +at::Tensor & map_(at::Tensor & self, const at::Tensor & 
other, PyObject* fn); +at::Tensor & map2_(at::Tensor & self, const at::Tensor & other1, + const at::Tensor & other2, PyObject* fn); }} // namespace torch::utils diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index d03fd55f2accfc..3a8b4a7bbc1592 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -139,10 +139,8 @@ ScalarType infer_scalar_type(PyObject *obj) { } #ifdef USE_NUMPY if (PyArray_Check(obj)) { - return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)obj)); - } - if (PyArray_CheckScalar(obj)) { - return numpy_dtype_to_aten(PyArray_TYPE((PyArrayObject*)(PyArray_FromScalar(obj, NULL)))); + auto array = (PyArrayObject*)obj; + return numpy_dtype_to_aten(PyArray_TYPE(array)); } #endif if (PySequence_Check(obj)) { diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index a2086ae95b899c..f8b26b121fd3e8 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -61,8 +61,7 @@ def init_process_group(backend, init_method='env://', **kwargs): group_name (str, optional): Group name. See description of init methods. To enable ``backend == mpi``, PyTorch needs to built from source on a system that - supports MPI. If you want to use Openmpi with CUDA-aware support, please use Openmpi - major version 2 and above. + supports MPI. """ world_size = kwargs.pop('world_size', -1) diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index 47ee177c2cc959..ca961d88ba0a63 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -96,7 +96,6 @@ from .lowrank_multivariate_normal import LowRankMultivariateNormal from .multinomial import Multinomial from .multivariate_normal import MultivariateNormal -from .negative_binomial import NegativeBinomial from .normal import Normal from .one_hot_categorical import OneHotCategorical from .pareto import Pareto @@ -130,7 +129,6 @@ 'LogisticNormal', 'Multinomial', 'MultivariateNormal', - 'NegativeBinomial', 'Normal', 'OneHotCategorical', 'Pareto', diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py index f8688af3f3a392..a263082c967fe1 100644 --- a/torch/distributions/constraint_registry.py +++ b/torch/distributions/constraint_registry.py @@ -164,9 +164,7 @@ def _transform_to_positive(constraint): @biject_to.register(constraints.greater_than) -@biject_to.register(constraints.greater_than_eq) @transform_to.register(constraints.greater_than) -@transform_to.register(constraints.greater_than_eq) def _transform_to_greater_than(constraint): return transforms.ComposeTransform([transforms.ExpTransform(), transforms.AffineTransform(constraint.lower_bound, 1)]) @@ -180,9 +178,7 @@ def _transform_to_less_than(constraint): @biject_to.register(constraints.interval) -@biject_to.register(constraints.half_open_interval) @transform_to.register(constraints.interval) -@transform_to.register(constraints.half_open_interval) def _transform_to_interval(constraint): # Handle the special case of the unit interval. 
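    # (when the bounds are exactly 0 and 1, a bare SigmoidTransform is returned instead of composing it with an affine rescaling)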
lower_is_0 = isinstance(constraint.lower_bound, numbers.Number) and constraint.lower_bound == 0 diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 0b6eb53b0cd93a..18da2bff1392a4 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -27,10 +27,8 @@ 'dependent', 'dependent_property', 'greater_than', - 'greater_than_eq', 'integer_interval', 'interval', - 'half_open_interval', 'is_dependent', 'less_than', 'lower_cholesky', @@ -153,17 +151,6 @@ def check(self, value): return self.lower_bound < value -class _GreaterThanEq(Constraint): - """ - Constrain to a real half line `[lower_bound, inf)`. - """ - def __init__(self, lower_bound): - self.lower_bound = lower_bound - - def check(self, value): - return self.lower_bound <= value - - class _LessThan(Constraint): """ Constrain to a real half line `[-inf, upper_bound)`. @@ -187,18 +174,6 @@ def check(self, value): return (self.lower_bound <= value) & (value <= self.upper_bound) -class _HalfOpenInterval(Constraint): - """ - Constrain to a real interval `[lower_bound, upper_bound)`. - """ - def __init__(self, lower_bound, upper_bound): - self.lower_bound = lower_bound - self.upper_bound = upper_bound - - def check(self, value): - return (self.lower_bound <= value) & (value < self.upper_bound) - - class _Simplex(Constraint): """ Constrain to the unit simplex in the innermost (rightmost) dimension. @@ -265,11 +240,9 @@ def check(self, value): real_vector = _RealVector() positive = _GreaterThan(0.) greater_than = _GreaterThan -greater_than_eq = _GreaterThanEq less_than = _LessThan unit_interval = _Interval(0., 1.) interval = _Interval -half_open_interval = _HalfOpenInterval simplex = _Simplex() lower_triangular = _LowerTriangular() lower_cholesky = _LowerCholesky() diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py deleted file mode 100644 index 854ad5b7b087fa..00000000000000 --- a/torch/distributions/negative_binomial.py +++ /dev/null @@ -1,83 +0,0 @@ -import torch -import torch.nn.functional as F -from torch.distributions import constraints -from torch.distributions.distribution import Distribution -from torch.distributions.utils import broadcast_all, probs_to_logits, lazy_property, logits_to_probs - - -class NegativeBinomial(Distribution): - r""" - Creates a Negative Binomial distribution, i.e. distribution - of the number of independent identical Bernoulli trials - needed before `total_count` failures are achieved. The probability - of success of each Bernoulli trial is `probs`. 
- - Args: - total_count (float or Tensor): non-negative number of negative Bernoulli - trials to stop, although the distribution is still valid for real - valued count - probs (Tensor): Event probabilities of success in the half open interval [0, 1) - logits (Tensor): Event log-odds for probabilities of success - """ - arg_constraints = {'total_count': constraints.greater_than_eq(0), - 'probs': constraints.half_open_interval(0., 1.)} - support = constraints.nonnegative_integer - - def __init__(self, total_count, probs=None, logits=None, validate_args=None): - if (probs is None) == (logits is None): - raise ValueError("Either `probs` or `logits` must be specified, but not both.") - if probs is not None: - self.total_count, self.probs, = broadcast_all(total_count, probs) - self.total_count = self.total_count.type_as(self.probs) - else: - self.total_count, self.logits, = broadcast_all(total_count, logits) - self.total_count = self.total_count.type_as(self.logits) - - self._param = self.probs if probs is not None else self.logits - batch_shape = self._param.size() - super(NegativeBinomial, self).__init__(batch_shape, validate_args=validate_args) - - def _new(self, *args, **kwargs): - return self._param.new(*args, **kwargs) - - @property - def mean(self): - return self.total_count * torch.exp(self.logits) - - @property - def variance(self): - return self.mean / torch.sigmoid(-self.logits) - - @lazy_property - def logits(self): - return probs_to_logits(self.probs, is_binary=True) - - @lazy_property - def probs(self): - return logits_to_probs(self.logits, is_binary=True) - - @property - def param_shape(self): - return self._param.size() - - @lazy_property - def _gamma(self): - return torch.distributions.Gamma(concentration=self.total_count, - rate=torch.exp(-self.logits)) - - def sample(self, sample_shape=torch.Size()): - with torch.no_grad(): - rate = self._gamma.sample(sample_shape=sample_shape) - return torch.poisson(rate) - - def log_prob(self, value): - if self._validate_args: - self._validate_sample(value) - - log_unnormalized_prob = (self.total_count * F.logsigmoid(-self.logits) + - value * F.logsigmoid(self.logits)) - - log_normalization = (-torch.lgamma(self.total_count + value) + torch.lgamma(1. + value) + - torch.lgamma(self.total_count)) - - return log_unnormalized_prob - log_normalization diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 0219942aac155a..ccc0ffffa2ec21 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -32,19 +32,30 @@ def _finfo(tensor): return _FINFO[tensor.storage_type()] -# promote numbers to tensors of dtype torch.get_default_dtype() -def _default_promotion(v): - return torch.tensor(v, dtype=torch.get_default_dtype()) +def _broadcast_shape(shapes): + r""" + Given a list of tensor sizes, returns the size of the resulting broadcasted + tensor. + + Args: + shapes (list of torch.Size): list of tensor sizes + """ + shape = torch.Size() + for s in shapes: + shape = torch._C._infer_size(s, shape) + return shape def broadcast_all(*values): r""" Given a list of values (possibly containing numbers), returns a list where each value is broadcasted based on the following rules: - - `torch.*Tensor` instances are broadcasted as per :ref:`_broadcasting-semantics`. + - `torch.*Tensor` instances are broadcasted as per the `broadcasting rules + `_ - numbers.Number instances (scalars) are upcast to tensors having the same size and type as the first tensor passed to `values`. 
If all the - values are scalars, then they are upcasted to scalar Tensors. + values are scalars, then they are upcasted to Tensors having size + `(1,)`. Args: values (list of `numbers.Number` or `torch.*Tensor`) @@ -53,16 +64,22 @@ def broadcast_all(*values): ValueError: if any of the values is not a `numbers.Number` or `torch.*Tensor` instance """ - if not all(torch.is_tensor(v) or isinstance(v, Number) for v in values): + values = list(values) + scalar_idxs = [i for i in range(len(values)) if isinstance(values[i], Number)] + tensor_idxs = [i for i in range(len(values)) if values[i].__class__.__name__ == 'Tensor'] + if len(scalar_idxs) + len(tensor_idxs) != len(values): raise ValueError('Input arguments must all be instances of numbers.Number or torch.tensor.') - if not all(map(torch.is_tensor, values)): - new_tensor = _default_promotion - for value in values: - if torch.is_tensor(value): - new_tensor = value.new_tensor - break - values = [v if torch.is_tensor(v) else new_tensor(v) for v in values] - return torch.broadcast_tensors(*values) + if tensor_idxs: + broadcast_shape = _broadcast_shape([values[i].size() for i in tensor_idxs]) + for idx in tensor_idxs: + values[idx] = values[idx].expand(broadcast_shape) + template = values[tensor_idxs[0]] + for idx in scalar_idxs: + values[idx] = template.new(template.size()).fill_(values[idx]) + else: + for idx in scalar_idxs: + values[idx] = torch.tensor(float(values[idx])) + return values def _sum_rightmost(value, dim): diff --git a/torch/functional.py b/torch/functional.py index 0133a012981854..19d47f394fa757 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -10,7 +10,6 @@ 'argmin', 'btrifact', 'btriunpack', - 'broadcast_tensors', 'isfinite', 'isinf', 'isnan', @@ -20,28 +19,6 @@ ] -def broadcast_tensors(*tensors): - r"""broadcast_tensors(*tensors) -> List of Tensors - - Broadcasts the given tensors according to :ref:`_broadcasting-semantics`. - - Args: - *tensors: any number of tensors of the same type - - Example:: - - >>> x = torch.arange(3).view(1, 3) - >>> y = torch.arange(2).view(2, 1) - >>> a, b = torch.broadcast_tensors(x, y) - >>> a.size() - torch.Size([2, 3]) - >>> a - tensor([[0, 1, 2], - [0, 1, 2]]) - """ - return torch._C._VariableFunctions.broadcast_tensors(tensors) - - def split(tensor, split_size_or_sections, dim=0): r"""Splits the tensor into chunks. 
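The `broadcast_all` hunk above swaps `torch.broadcast_tensors` for an explicit `_broadcast_shape`/`expand` path, but its observable behavior for a scalar/tensor mix is the same on both sides of the diff. A minimal usage sketch (assuming a PyTorch build from either side of this change); the shapes follow from standard broadcasting semantics:

    >>> import torch
    >>> from torch.distributions.utils import broadcast_all
    >>> probs = torch.rand(2, 3)
    >>> total_count, probs = broadcast_all(10., probs)  # the scalar 10. is upcast to a tensor like `probs`
    >>> total_count.shape, probs.shape
    (torch.Size([2, 3]), torch.Size([2, 3]))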
diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index d09e970f729470..c0cf4f9d1c2e75 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -403,12 +403,9 @@ def wrapper(*args): else: new_args.append(arg) res = res_mod(*new_args) - assert len(res) % 3 == 0 - if len(res) % 3 != 0: - raise "non-batched-tensor output is not supported yet" - result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] - if len(result) == 1: - return result[0] + # assert len(res) / 3 == 0 + # result = [BatchTensor(*res[i * 3: i * 3 + 3]) for i in range(len(res) // 3)] + result = BatchTensor(*res) return result wrapper.__doc__ = fn.__doc__ return wrapper diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 1db7749e07e34e..77e6cf777f2784 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -3,7 +3,7 @@ import ast import inspect import torch -from torch._C import DynamicType, TupleType, FloatType, IntType +from torch._C import DynamicType, TupleType from textwrap import dedent @@ -204,13 +204,9 @@ def as_ann(ann): def ann_to_type(ann): if ann is None: - return DynamicType.get() + return DynamicType() elif ann is torch.Tensor: - return DynamicType.get() + return DynamicType() elif is_tuple(ann): return TupleType([ann_to_type(a) for a in ann.__args__]) - elif ann is float: - return FloatType.get() - elif ann is int: - return IntType.get() raise ValueError("The only supported annotations kinds are Tensor and Tuple[...]") diff --git a/torch/jit/batchop.py b/torch/jit/batchop.py index 053130dc0fb488..bda6a3adca3a88 100644 --- a/torch/jit/batchop.py +++ b/torch/jit/batchop.py @@ -1,9 +1,6 @@ import torch -from torch.jit import BatchTensor -# TODO: there are some commented raise statements -# when we support rasie exception in script, we want to check them @torch.jit.script def batch_tanh(data, mask, dims): data = torch.tanh(data) @@ -17,52 +14,13 @@ def batch_sigmoid(data, mask, dims): @torch.jit.script -def batch_relu(data, mask, dims): - data = torch.relu(data) - return data, mask, dims - - -@torch.jit.script -def batch_neg(data, mask, dims): - data = torch.neg(data) - return data, mask, dims - - -@torch.jit.script -def batch_neg_scalar(data): - return torch.neg(data) - - -@torch.jit.script -def batch_add(data1, mask1, dims1, data2, mask2, dims2, alpha_): - alpha = float(alpha_) - data = torch.add(data1, data2, alpha) - mask = mask1 * mask2 - dims = dims1 or dims2 - return data, mask, dims - - -@torch.jit.script -def batch_add_scalar(data, mask, dims, other, alpha_): - alpha = float(alpha_) - data = torch.add(data, other.type_as(data), alpha) - return data, mask, dims - - -@torch.jit.script -def batch_sub(data1, mask1, dims1, data2, mask2, dims2, alpha_): - alpha = float(alpha_) - data = torch.sub(data1, data2, alpha) +def batch_add(data1, mask1, dims1, data2, mask2, dims2): + data = torch.add(data1, data2) mask = mask1 * mask2 dims = dims1 or dims2 return data, mask, dims -@torch.jit.script -def batch_sub_scalar(data1, data2): - return data1 - data2 - - @torch.jit.script def batch_mul(data1, mask1, dims1, data2, mask2, dims2): data = torch.mul(data1, data2) @@ -71,17 +29,6 @@ def batch_mul(data1, mask1, dims1, data2, mask2, dims2): return data, mask, dims -@torch.jit.script -def batch_mul_scalar(data1, data2): - return data1 * data2 - - -@torch.jit.script -def batch_div(data, mask, dims, other): # div(batchtensor, scalar) - data = torch.div(data, other) - return data, mask, dims - - @torch.jit.script def batch_mm(data1, mask1, dims1, data2, 
mask2, dims2): data1 = data1 * mask1.type_as(data1) @@ -141,388 +88,26 @@ def batch_select(data, mask, dims, dim_, index_): # raise ValueError("Cannot select 0 dim in BatchTensor") data = data.select(dim, index) if dims[dim - 1]: - mask = mask.select(dim, index) - else: mask = mask.select(dim, 0) + else: + mask = mask.select(dim, index) dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) return data, mask, dims -@torch.jit.script -def batch_fmod(data, mask, dims, other_): - other = int(other_) - data = torch.fmod(data, other) - return data, mask, dims - - -@torch.jit.script -def batch_zeros_like(data, mask, dims): - res_data = torch.zeros_like(data) - return res_data, mask, dims - - -@torch.jit.script -def batch_index_select(data, mask, dims, dim_, index_data, index_mask, index_dims): - dim = int(dim_) - # if dim == 0: - # raise ValueError("Cannot index_select along 0 dim in BatchTensor") - batch_size = data.size(0) # TODO maybe index_mask will be used at some point - res_data = torch.zeros([0]) - res_mask = torch.zeros([0]) - for i in range(batch_size): - d = data[i].index_select(dim - 1, index_data[i]).unsqueeze(0) - if dims[dim - 1]: - m = mask[i].index_select(dim - 1, index_data[i]).unsqueeze(0) - else: - m = mask[i].unsqueeze(0) - if i == 0: - res_data = d - res_mask = m - else: - res_data = torch.cat((res_data, d), 0) - res_mask = torch.cat((res_mask, m), 0) - return res_data, res_mask, dims - - -@torch.jit.script -def batch_view_as(data, mask, dims, data1, mask1, dims1): - # if data.size(0) != data1.size(0): - # raise ValueError("In view_as, tensor and target tensor should have the same batch_size") - # if not torch.equal(dims, dims1): - # raise ValueError("In batched view_as, dims and target dims should be the same") - data = data.view_as(data1) - mask = mask.view_as(mask1) - dims = dims1 - return data, mask, dims - - # assume data, data1, data2 have same size @torch.jit.script def batch_where(data, mask, dims, data1, mask1, dims1, data2, mask2, dims2): - data = data * mask.type_as(data) - cond_data = data - cond_mask = data - if data.dim() == 1: - for _ in range(data1.dim() - 1): - data = data.unsqueeze(data.dim()) - cond_data = data.expand_as(data1) - cond_mask = data.expand_as(mask1) - res_data = torch.where(cond_data, data1, data2) - res_mask = torch.where(cond_mask, mask1, mask2) + res_data = torch.where(data, data1, data2) + res_mask = torch.where(data, mask1, mask2) res_dims = dims1 or dims2 return res_data, res_mask, res_dims - -@torch.jit.script -def batch_where_scalar(cond_, data1, mask1, dims1, data2, mask2, dims2): - cond = torch.zeros([1], dtype=torch.uint8) * cond_ - res_data = torch.where(cond, data1, data2) - res_mask = torch.where(cond, mask1, mask2) - res_dims = torch.where(cond, dims1, dims2) - return res_data, res_mask, res_dims - - -@torch.jit.script -def batch_update(batch_data, batch_mask, batch_dims, new_data, new_mask, new_dims): - data = torch.where(new_mask, new_data, batch_data) - return data, new_mask, new_dims # TODO: consider whether return new_mask and new_dims - - -@torch.jit.script -def batch_any(data, mask, dims): - return torch.gt(torch.sum(data * mask), 0) - - -@torch.jit.script -def batch_type_as(data, mask, dims, data1, mask1, dims1): - return data.type_as(data1), mask, dims - - -@torch.jit.script -def batch_gt(data, mask, dims, data1, mask1, dims1): - return torch.gt(data, data1), mask * mask1, dims or dims1 - - -@torch.jit.script -def batch_gt_scalar(data1, data2): - return torch.gt(data1, data2) - - -@torch.jit.script -def 
batch_gt_one_scalar(data, mask, dims, other_): - other = float(other_) - return torch.gt(data, other), mask, dims - - -@torch.jit.script -def batch_lt(data, mask, dims, data1, mask1, dims1): - return torch.lt(data, data1), mask * mask1, dims or dims1 - - -@torch.jit.script -def batch_eq(data, mask, dims, data1, mask1, dims1): - return torch.eq(data, data1), mask * mask1, dims or dims1 - - -@torch.jit.script -def batch_size(data, mask, dims, dim_): - dim = int(dim_) - return data.size(dim) - - -@torch.jit.script -def batch_dim(data, mask, dims): - return data.dim() - - -@torch.jit.script -def batch_squeeze(data, mask, dims, dim_): - if int(dim_) < 0: - dim_ += data.dim() - dim = int(dim_) - # if dim == 0: - # raise ValueError("cannot do squeeze along batch_dim") - data = data.squeeze(dim) - mask = mask.squeeze(dim) - dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) - return data, mask, dims - - -@torch.jit.script -def batch_unsqueeze(data, mask, dims, dim_): - if int(dim_) < 0: - dim_ += data.dim() + 1 - dim = int(dim_) - # if dim == 0: - # raise ValueError("cannot do unsqueeze along batch_dim") - data = data.unsqueeze(dim) - mask = mask.unsqueeze(dim) - dims = torch.cat((dims[:dim], torch.zeros([1], dtype=torch.uint8), dims[dim:dims.size(0)])) - return data, mask, dims - - -@torch.jit.script -def batch_argmax(data, mask, dims, dim_, keepdim_): - dim = int(dim_) - keepdim = int(keepdim_) - # if dim == 0: - # raise ValueError("cannot do argmax along batch_dim") - batch_size = data.size(0) - res_data = torch.zeros([0]) - for i in range(batch_size): - if dims[dim - 1]: - if dim - 1 != 0: - m = mask[i].transpose(0, dim - 1) - else: - m = mask[i] - valid_num = m.sum(0, keepdim=True) - while(valid_num.dim() >= 1): - valid_num = valid_num[0] - d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) - else: - d = data[i].unsqueeze(0) - d = d.argmax(dim, keepdim) - if i == 0: - res_data = d - else: - res_data = torch.cat([res_data, d], 0) - if keepdim: - mask = mask - else: - mask = mask.select(dim, 0) - dims = torch.cat((dims[:dim - 1], dims[dim:dims.size(0)])) - return res_data, mask, dims - - -@torch.jit.script -def batch_topk(data, mask, dims, k_, dim_, largest_, sorted_): - k = int(k_) - dim = int(dim_) - largest = int(largest_) - sorted = int(sorted_) - # if dim == 0: - # raise ValueError("cannot do topk along batch_dim") - batch_size = data.size(0) - res_data = torch.zeros([0]) - res_index = torch.zeros([0]) - for i in range(batch_size): - if dims[dim - 1]: - if dim - 1 != 0: - m = mask[i].transpose(0, dim - 1) - else: - m = mask[i] - valid_num = m.sum(0, keepdim=True) - while(valid_num.dim() >= 1): - valid_num = valid_num[0] - d = data[i].unsqueeze(0).narrow(dim, 0, int(valid_num)) - else: - d = data[i].unsqueeze(0) - d, idx = d.topk(k, dim, largest, sorted) - if i == 0: - res_data = d - res_index = idx - else: - res_data = torch.cat([res_data, d], 0) - res_index = torch.cat([res_index, idx], 0) - if dims[dim - 1]: - mask = mask.narrow(dim, 0, k) - return res_data, mask, dims, res_index, mask, dims - - -@torch.jit.script -def batch_softmax(data, mask, dims, dim_): - dim = int(dim_) - # if dim == 0: - # raise ValueError("cannot do softmax along batch_dim") - batch_size = data.size(0) - max_len = data.size(dim) - res_data = torch.zeros([0]) - for i in range(batch_size): - if dims[dim - 1]: - if dim - 1 != 0: - m = mask[i].transpose(0, dim - 1) - else: - m = mask[i] - valid_num = m.sum(0, keepdim=True) - while(valid_num.dim() >= 1): - valid_num = valid_num[0] - valid_num = 
int(valid_num) - d = data[i].unsqueeze(0).narrow(dim, 0, valid_num).softmax(dim) - if valid_num < max_len: - d = torch.cat([d, data[i].unsqueeze(0).narrow(dim, valid_num, max_len - valid_num)], dim) - else: - d = data[i].unsqueeze(0).softmax(dim) - if i == 0: - res_data = d - else: - res_data = torch.cat([res_data, d], 0) - return res_data, mask, dims - - -# size argument in dynamic dimension has to be -1 -# in static dimension, size has to be specified, -1 is not supported -@torch.jit.script -def batch_view(data, mask, dims, sizes): - batch_size = data.size(0) - # if(sizes[0] != batch_size and sizes[0] != -1 and sizes[0] != 1): - # raise "first dim in view must be 1, -1, or batch size" - # for i in range(dims.size(0)): - # if dims[0] == 1 and sizes[i + 1] != -1: - # raise "size argument in dynamic dimension has to be -1" - sizes = sizes.type_as(torch.ones([1], dtype=torch.int)) - data_sizes_ = torch.cat([torch.ones([1], dtype=torch.int) * batch_size, sizes.narrow(0, 1, sizes.size(0) - 1)], 0) - data_sizes = data_sizes_._tensor_to_list() - res_data = data.view(data_sizes) - mask_sizes_ = data_sizes_.narrow(0, 0, 1) - res_dims = data_sizes_.narrow(0, 0, 1) - for i_ in range(sizes.size(0) - 1): - i = i_ + 1 - if(sizes[i] == -1): - cur_size_ = mask.size(i) - cur_dim = 1 - else: - cur_size_ = 1 - cur_dim = 0 - mask_sizes_ = torch.cat([mask_sizes_, torch.ones([1], dtype=torch.int) * cur_size_]) - res_dims = torch.cat([res_dims, torch.ones([1], dtype=torch.int) * cur_dim]) - mask_sizes = mask_sizes_._tensor_to_list() - res_mask = mask.view(mask_sizes) - return res_data, res_mask, res_dims.narrow(0, 1, res_dims.size(0) - 1).type_as(dims) - - -@torch.jit.script -def batch_cat2(data1, mask1, dims1, data2, mask2, dims2, dim_): - dim = int(dim_) - data = torch.cat([data1, data2], dim) - if(dims1[dim - 1]): - mask = torch.cat([mask1, mask2], dim) - else: - mask = mask1 - return data, mask, dims1 - - -@torch.jit.script -def batch_cat3(data1, mask1, dims1, data2, mask2, dims2, data3, mask3, dims3, dim_): - dim = int(dim_) - data = torch.cat([data1, data2, data3], dim) - if(dims1[dim - 1]): - mask = torch.cat([mask1, mask2, mask3], dim) - else: - mask = mask1 - return data, mask, dims1 - - -@torch.jit.script -def batch_narrow(data, mask, dims, dimension_, start_, length_): - dimension = int(dimension_) - start = int(start_) - length = int(length_) - # if dimension == 0: - # raise ValueError("cannot do narrow along batch_dim") - data = data.narrow(dimension, start, length) - if dims[dimension - 1]: - mask = mask.narrow(dimension, start, length) - else: - mask = mask.narrow(dimension, 0, 1) - return data, mask, dims - - -@torch.jit.script -def batch_sum(data, mask, dims): - data = data * mask.type_as(data) - for _ in range(dims.size(0)): - data = data.sum(1) - mask = torch.ones([data.size(0)], dtype=torch.uint8) - dims = dims[:0] # empty tensor - return data, mask, dims - - -@torch.jit.script -def batch_from_scalar_tensor(data): - data = data.unsqueeze(0) - mask = torch.ones([1], dtype=torch.uint8) - dims = torch.zeros([0], dtype=torch.uint8) - return data, mask, dims - torch.register_batch_operator("tanh", batch_tanh.graph) torch.register_batch_operator("sigmoid", batch_sigmoid.graph) -torch.register_batch_operator("relu", batch_relu.graph) -torch.register_batch_operator("neg", batch_neg.graph) -torch.register_batch_operator("neg", batch_neg_scalar.graph) torch.register_batch_operator("add", batch_add.graph) -torch.register_batch_operator("add", batch_add_scalar.graph) 
-torch.register_batch_operator("sub", batch_sub.graph) -torch.register_batch_operator("sub", batch_sub_scalar.graph) torch.register_batch_operator("mul", batch_mul.graph) -torch.register_batch_operator("mul", batch_mul_scalar.graph) -torch.register_batch_operator("div", batch_div.graph) torch.register_batch_operator("matmul", batch_matmul.graph) torch.register_batch_operator("mm", batch_mm.graph) -torch.register_batch_operator("fmod", batch_fmod.graph) -torch.register_batch_operator("zeros_like", batch_zeros_like.graph) torch.register_batch_operator("select", batch_select.graph) -torch.register_batch_operator("index_select", batch_index_select.graph) -torch.register_batch_operator("view_as", batch_view_as.graph) torch.register_batch_operator("where", batch_where.graph) -torch.register_batch_operator("where", batch_where_scalar.graph) -torch.register_batch_operator("update", batch_update.graph) -torch.register_batch_operator("any", batch_any.graph) -torch.register_batch_operator("type_as", batch_type_as.graph) -torch.register_batch_operator("gt", batch_gt.graph) -torch.register_batch_operator("gt", batch_gt_scalar.graph) -torch.register_batch_operator("gt", batch_gt_one_scalar.graph) -torch.register_batch_operator("lt", batch_lt.graph) -torch.register_batch_operator("eq", batch_eq.graph) -torch.register_batch_operator("size", batch_size.graph) -torch.register_batch_operator("dim", batch_dim.graph) -torch.register_batch_operator("squeeze", batch_squeeze.graph) -torch.register_batch_operator("unsqueeze", batch_unsqueeze.graph) -torch.register_batch_operator("argmax", batch_argmax.graph) -torch.register_batch_operator("topk", batch_topk.graph) -torch.register_batch_operator("softmax", batch_softmax.graph) -torch.register_batch_operator("view", batch_view.graph) -torch.register_batch_operator("cat", batch_cat2.graph) -torch.register_batch_operator("cat", batch_cat3.graph) -torch.register_batch_operator("narrow", batch_narrow.graph) -torch.register_batch_operator("sum", batch_sum.graph) -torch.register_batch_operator("batch_from_scalar_tensor", batch_from_scalar_tensor.graph) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index bc979d15141121..d152b2010fcae4 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -435,8 +435,8 @@ def build_List(ctx, expr): @staticmethod def build_Tuple(ctx, expr): - return TupleLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), - [build_expr(ctx, e) for e in expr.elts]) + return ListLiteral(ctx.make_range(expr.lineno, expr.col_offset, expr.col_offset + 1), + [build_expr(ctx, e) for e in expr.elts]) @staticmethod def build_Num(ctx, expr): diff --git a/torch/legacy/nn/ELU.py b/torch/legacy/nn/ELU.py index 9e00e8a172fc88..6ad240658a9e28 100644 --- a/torch/legacy/nn/ELU.py +++ b/torch/legacy/nn/ELU.py @@ -23,7 +23,6 @@ def updateOutput(self, input): self.output, self.alpha, 1.0, - 1.0, self.inplace ) return self.output @@ -35,7 +34,6 @@ def updateGradInput(self, input, gradOutput): self.gradInput, self.output, self.alpha, - 1.0, 1.0 ) return self.gradInput diff --git a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp index b23157581bdfc0..cc176931d8c0c2 100644 --- a/torch/lib/THD/base/data_channels/DataChannelMPI.cpp +++ b/torch/lib/THD/base/data_channels/DataChannelMPI.cpp @@ -100,14 +100,6 @@ void DataChannelMPI::destroy() {} bool DataChannelMPI::init() { -#ifdef OMPI_MAJOR_VERSION - // OMPI_* is specific to Openmpi implementation. 
- // Openmpi v1.10 segfaults in MPI_Bcast with CUDA buffer. - if (int(OMPI_MAJOR_VERSION) < 2) { - throw std::runtime_error("Please use Openmpi major version 2 and above for distributed."); - } -#endif /* OMPI_MAJOR_VERSION */ - int provided; MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); if (provided != MPI_THREAD_MULTIPLE) { diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 9bb0ef0e98ca82..26f6c480420b67 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -64,7 +64,7 @@ inline std::vector<std::vector<int64_t>> getSizes( const std::vector<at::Tensor>& tensors) { std::vector<std::vector<int64_t>> sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { - sizes[i] = tensors[i].sizes().vec(); + sizes[i] = tensors[i].sizes(); } return sizes; } diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 746c2664529175..17a7c09b012da6 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -741,25 +741,6 @@ def selu(input, inplace=False): """) -def celu(input, alpha=1., inplace=False): - r"""celu(input, alpha=1., inplace=False) -> Tensor - - Applies element-wise, - :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))`. - - See :class:`~torch.nn.CELU` for more details. - """ - if inplace: - return torch.celu_(input, alpha) - return torch.celu(input, alpha) - -celu_ = _add_docstr(torch.celu_, r""" -celu_(input, alpha=1.) -> Tensor - -In-place version of :func:`~celu`. -""") - - def leaky_relu(input, negative_slope=0.01, inplace=False): r""" leaky_relu(input, negative_slope=0.01, inplace=False) -> Tensor @@ -878,7 +859,7 @@ def softmin(input, dim=None, _stacklevel=3): """ if dim is None: dim = _get_softmax_dim('softmin', input.dim(), _stacklevel) - return (-input).softmax(dim) + return -input.softmax(dim) def softmax(input, dim=None, _stacklevel=3): @@ -1118,7 +1099,7 @@ def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2, assert padding_idx >= -weight.size(0), 'Padding_idx must be within num_embeddings' padding_idx = weight.size(0) + padding_idx elif padding_idx is None: - padding_idx = -1 + padding_idx = -1 if max_norm is not None: # `embedding_renorm_` will call .contiguous() on input anyways, so we # call it here and take advantage of the improved locality in the @@ -1369,41 +1350,6 @@ def local_response_norm(input, size, alpha=1e-4, beta=0.75, k=1): # loss -def ctc_loss(log_probs, targets, input_lengths, target_lengths, blank=0, - reduction='elementwise_mean'): - r"""The Connectionist Temporal Classification loss. - - See :class:`~torch.nn.CTCLoss` for details. - - Args: - log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, - `T = input length`, and `N = batch size`. - The logarithmized probabilities of the outputs - (e.g. obtained with :func:`torch.nn.functional.log_softmax`). - targets: :math:`(N, S)` or `(sum(target_lenghts))`. - Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. - input_lengths: :math:`(N)`. - Lengths of the inputs (must each be :math:`\leq T`) - target_lengths: :math:`(N)`. - Lengths of the targets - blank (int, optional): - Blank label. Default :math:`0`. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, - 'elementwise_mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken.
Default: 'elementwise_mean' - - Example:: - - >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() - >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) - >>> input_lengths = torch.full((16,), 50, dtype=torch.long) - >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) - >>> loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths) - >>> loss.backward() - """ - return torch.ctc_loss(log_probs, targets, input_lengths, target_lengths, blank, _Reduction.get_enum(reduction)) - def nll_loss(input, target, weight=None, size_average=None, ignore_index=-100, reduce=None, reduction='elementwise_mean'): @@ -1725,7 +1671,7 @@ def _pointwise_loss(lambd, lambd_optimized, input, target, reduction='elementwis return d return torch.mean(d) if reduction == 'elementwise_mean' else torch.sum(d) else: - return lambd_optimized(input, target, _Reduction.get_enum(reduction)) + return lambd_optimized(input, target, reduction) def smooth_l1_loss(input, target, size_average=None, reduce=None, reduction='elementwise_mean'): @@ -1749,7 +1695,9 @@ def l1_loss(input, target, size_average=None, reduce=None, reduction='elementwis See :class:`~torch.nn.L1Loss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_string(size_average, reduce) + reduction = _Reduction.legacy_get_enum(size_average, reduce) + else: + reduction = _Reduction.get_enum(reduction) return _pointwise_loss(lambda a, b: torch.abs(a - b), torch._C._nn.l1_loss, input, target, reduction) @@ -1762,7 +1710,9 @@ def mse_loss(input, target, size_average=None, reduce=None, reduction='elementwi See :class:`~torch.nn.MSELoss` for details. """ if size_average is not None or reduce is not None: - reduction = _Reduction.legacy_get_string(size_average, reduce) + reduction = _Reduction.legacy_get_enum(size_average, reduce) + else: + reduction = _Reduction.get_enum(reduction) return _pointwise_loss(lambda a, b: (a - b) ** 2, torch._C._nn.mse_loss, input, target, reduction) diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index 6c66f8d43f005f..4d98f482768a63 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -3,10 +3,10 @@ from .conv import Conv1d, Conv2d, Conv3d, \ ConvTranspose1d, ConvTranspose2d, ConvTranspose3d from .activation import Threshold, ReLU, Hardtanh, ReLU6, Sigmoid, Tanh, \ - Softmax, Softmax2d, LogSoftmax, ELU, SELU, CELU, Hardshrink, LeakyReLU, LogSigmoid, \ + Softmax, Softmax2d, LogSoftmax, ELU, SELU, Hardshrink, LeakyReLU, LogSigmoid, \ Softplus, Softshrink, PReLU, Softsign, Softmin, Tanhshrink, RReLU, GLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ - CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ + CosineEmbeddingLoss, HingeEmbeddingLoss, MarginRankingLoss, \ MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict @@ -31,10 +31,10 @@ __all__ = [ 'Module', 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'Threshold', 'ReLU', 'Hardtanh', 'ReLU6', - 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'CELU', 'GLU', 'Hardshrink', + 'Sigmoid', 'Tanh', 'Softmax', 'Softmax2d', 'LogSoftmax', 'ELU', 'SELU', 'GLU', 'Hardshrink', 'LeakyReLU', 'LogSigmoid', 
'Softplus', 'Softshrink', 'PReLU', 'Softsign', 'Softmin', 'Tanhshrink', 'RReLU', 'L1Loss', 'NLLLoss', 'KLDivLoss', 'MSELoss', 'BCELoss', 'BCEWithLogitsLoss', - 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'CTCLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', + 'NLLLoss2d', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'HingeEmbeddingLoss', 'MarginRankingLoss', 'MultiLabelMarginLoss', 'MultiLabelSoftMarginLoss', 'MultiMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss', 'CrossEntropyLoss', 'Container', 'Sequential', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict', 'AvgPool1d', 'AvgPool2d', 'AvgPool3d', 'MaxPool1d', 'MaxPool2d', diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 51cfab79404145..d372a2cae21d2c 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -118,7 +118,6 @@ class RReLU(Module): .. _`Empirical Evaluation of Rectified Activations in Convolutional Network`: https://arxiv.org/abs/1505.00853 """ - def __init__(self, lower=1. / 8, upper=1. / 3, inplace=False): super(RReLU, self).__init__() self.lower = lower @@ -300,46 +299,6 @@ def extra_repr(self): return 'alpha={}{}'.format(self.alpha, inplace_str) -class CELU(Module): - r"""Applies element-wise, - :math:`\text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x/\alpha) - 1))` - - More details can be found in the paper `Continuously Differentiable Exponential Linear Units`_ . - - Args: - alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0 - inplace: can optionally do the operation in-place. Default: ``False`` - - Shape: - - Input: :math:`(N, *)` where `*` means, any number of additional - dimensions - - Output: :math:`(N, *)`, same shape as the input - - .. image:: scripts/activation_images/CELU.png - - Examples:: - - >>> m = nn.CELU() - >>> input = torch.randn(2) - >>> output = m(input) - - .. _`Continuously Differentiable Exponential Linear Units`: - https://arxiv.org/abs/1704.07483 - """ - - def __init__(self, alpha=1., inplace=False): - super(CELU, self).__init__() - self.alpha = alpha - self.inplace = inplace - - def forward(self, input): - return F.celu(input, self.alpha, self.inplace) - - def extra_repr(self): - inplace_str = ', inplace' if self.inplace else '' - return 'alpha={}{}'.format(self.alpha, inplace_str) - - class SELU(Module): r"""Applies element-wise, :math:`\text{SELU}(x) = \text{scale} * (\max(0,x) + \min(0, \alpha * (\exp(x) - 1)))`, @@ -709,7 +668,6 @@ class Softmin(Module): >>> input = torch.randn(2, 3) >>> output = m(input) """ - def __init__(self, dim=None): super(Softmin, self).__init__() self.dim = dim diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index ec7d60d8125152..489e8998843f98 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1123,61 +1123,6 @@ def forward(self, anchor, positive, negative): return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p, eps=self.eps, swap=self.swap, reduction=self.reduction) - -class CTCLoss(_Loss): - r"""The Connectionist Temporal Classification loss. - - Args: - blank (int, optional): blank label. Default :math:`0`. - reduction (string, optional): Specifies the reduction to apply to the output: - 'none' | 'elementwise_mean' | 'sum'. 'none': no reduction will be applied, - 'elementwise_mean': the output losses will be divided by the target lengths and - then the mean over the batch is taken. 
Default: 'elementwise_mean' - - Inputs: - log_probs: :math:`(T, N, C)` where `C = number of characters in alphabet including blank`, - `T = input length`, and `N = batch size`. - The logarithmized probabilities of the outputs - (e.g. obtained with :func:`torch.nn.functional.log_softmax`). - targets: :math:`(N, S)` or `(sum(target_lenghts))`. - Targets (cannot be blank). In the second form, the targets are assumed to be concatenated. - input_lengths: :math:`(N)`. - Lengths of the inputs (must each be :math:`\leq T`) - target_lengths: :math:`(N)`. - Lengths of the targets - - - Example:: - - >>> ctc_loss = nn.CTCLoss() - >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_() - >>> targets = torch.randint(1, 21, (16, 30), dtype=torch.long) - >>> input_lengths = torch.full((16,), 50, dtype=torch.long) - >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long) - >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths) - >>> loss.backward() - - Reference: - A. Graves et al.: Connectionist Temporal Classification: - Labelling Unsegmented Sequence Data with Recurrent Neural Networks: - https://www.cs.toronto.edu/~graves/icml_2006.pdf - - .. Note:: - In order to use CuDNN, the following must be satisfied: :attr:`targets` must be - in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`, - :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of - :class:`torch.IntTensor`. - - The regular implementation uses the (more common in PyTorch) `torch.long` dtype. - """ - - def __init__(self, blank=0, reduction='elementwise_mean'): - super(CTCLoss, self).__init__(reduction=reduction) - self.blank = blank - - def forward(self, log_probs, targets, input_lengths, target_lengths): - return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction) - # TODO: L1HingeEmbeddingCriterion # TODO: MSECriterion weight # TODO: ClassSimplexCriterion diff --git a/torch/nn/parallel/distributed_c10d.py b/torch/nn/parallel/distributed_c10d.py index 424670ac76fc14..c2b32cb97b6b01 100644 --- a/torch/nn/parallel/distributed_c10d.py +++ b/torch/nn/parallel/distributed_c10d.py @@ -242,7 +242,11 @@ def train(self, mode=True): module.train(mode) def _dist_broadcast_coalesced(self, tensors, buffer_size): - c10d._dist_broadcast_coalesced(tensors, buffer_size, self.process_group) + for tensors in _take_tensors(tensors, buffer_size): + flat_tensors = _flatten_dense_tensors(tensors) + c10d.broadcast(flat_tensors, 0, self.process_group).wait() + for tensor, synced in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)): + tensor.copy_(synced) def _sync_params(self): if len(self.device_ids) > 1: diff --git a/torch/nn/utils/convert_parameters.py b/torch/nn/utils/convert_parameters.py index 36a7eb207bcc65..7f0dd1666dad9c 100644 --- a/torch/nn/utils/convert_parameters.py +++ b/torch/nn/utils/convert_parameters.py @@ -45,9 +45,9 @@ def vector_to_parameters(vec, parameters): param_device = _check_param_device(param, param_device) # The length of the parameter - num_param = param.numel() + num_param = torch.prod(torch.LongTensor(list(param.size()))) # Slice the vector, reshape it, and replace the old data of the parameter - param.data = vec[pointer:pointer + num_param].view_as(param).data + param.data = vec[pointer:pointer + num_param].view(param.size()).data # Increment the pointer pointer += num_param diff --git a/torch/onnx/symbolic.py b/torch/onnx/symbolic.py index 3262ca282b2c5d..3ca44f35c4eff3 100644 --- 
a/torch/onnx/symbolic.py +++ b/torch/onnx/symbolic.py @@ -70,12 +70,6 @@ def _get_const(value, desc, arg_name): return _parse_arg(value, desc) -def _unpack_list(list_value): - list_node = list_value.node() - assert list_node.kind() == "prim::ListConstruct" - return list_node.inputs() - - def parse_args(*arg_descriptors): def decorator(fn): def wrapper(g, *args): @@ -221,18 +215,13 @@ def reciprocal(g, self): return g.op("Div", _if_scalar_type_as(g, torch.ones(1), self), self) -@parse_args('v', 'i') -def cat(g, tensor_list, dim): - tensors = _unpack_list(tensor_list) +# This syntax is Python 2 portable +def cat(g, *args): + dim = _get_const(args[-1], 'i', 'dim') + tensors = args[:-1] return g.op("Concat", *tensors, axis_i=dim) -@parse_args('v', 'i') -def stack(g, tensor_list, dim): - unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in _unpack_list(tensor_list)] - return g.op("Concat", *unsqueezed, axis_i=dim) - - def mm(g, self, other): # Create a dummy C tensor. Only needed for API purposes, the value is # since beta = 0 @@ -360,6 +349,11 @@ def view(g, self, size): return g.op("Reshape", self, shape) +def stack(g, *args): + unsqueezed = [g.op("Unsqueeze", t, axes_i=[dim]) for t in args[:-1]] + [args[-1]] + return concat(g, *unsqueezed) + + @parse_args('v', 'i', 'i') def split(g, self, split_size, dim): size = self.type().sizes()[dim] @@ -561,10 +555,9 @@ def replication_pad(g, input, padding): @parse_args('v', 'is') def upsample_nearest2d(g, input, output_size): - height_scale = float(output_size[-2]) / input.type().sizes()[-2] - width_scale = float(output_size[-1]) / input.type().sizes()[-1] return g.op("Upsample", input, - scales_f=[1., 1., height_scale, width_scale], + height_scale_f=float(output_size[-2]) / input.type().sizes()[-2], + width_scale_f=float(output_size[-1]) / input.type().sizes()[-1], mode_s="nearest") @@ -572,11 +565,10 @@ def upsample_nearest2d(g, input, output_size): def upsample_bilinear2d(g, input, output_size, align_corners): if align_corners: return _unimplemented("upsample_bilinear2d", "align_corners == True") - height_scale = float(output_size[-2]) / input.type().sizes()[-2] - width_scale = float(output_size[-1]) / input.type().sizes()[-1] - return g.op("Upsample", input, - scales_f=[1., 1., height_scale, width_scale], - mode_s="bilinear") + w_scale = float(output_size[-1]) / input.type().sizes()[-1] + h_scale = float(output_size[-2]) / input.type().sizes()[-2] + return g.op("Upsample", input, width_scale_f=w_scale, + height_scale_f=h_scale, mode_s="bilinear") def gt(g, input, other): @@ -667,12 +659,10 @@ def unfold(g, input, dimension, size, step): return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) -@parse_args('v', 't', 't', 't') -def elu(g, input, alpha, scale, input_scale): +@parse_args('v', 't', 't') +def elu(g, input, alpha, scale): if scale and scale != 1.: return _unimplemented("scale", "does not support scale in Elu") - if input_scale and input_scale != 1.: - return _unimplemented("input_scale", "does not support input_scale in Elu") # See Note [Export inplace] return g.op("Elu", input, alpha_f=_scalar(alpha)) @@ -686,10 +676,8 @@ def index_select(g, self, dim, index): return g.op("Gather", self, index, axis_i=dim) -def index_put(g, self, indices_list_value, values): - indices_list = list(_unpack_list(indices_list_value)) - args = [self] + indices_list + [values] - return g.op("ATen", *args, operator_s='index_put') +def index_put(g, *inputs): + return g.op("ATen", *inputs, operator_s='index_put') def 
type_as(g, self, other): @@ -880,17 +868,14 @@ def topk(g, self, k, dim, largest, sorted, out=None): return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) +@parse_args('v', 'is') def repeat(g, self, repeats): - if not _is_value(repeats): - repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) - const_repeats = _maybe_get_const(repeats, 'is') - - if self.isTensor() and not _is_value(const_repeats): + if self.isTensor(): sizes = self.type().sizes() - diff_dims = len(const_repeats) - len(sizes) + diff_dims = len(repeats) - len(sizes) if diff_dims > 0: self = view(g, self, [1] * diff_dims + sizes) - return g.op("Tile", self, repeats) + return g.op("Tile", self, g.op("Constant", value_t=torch.LongTensor(repeats))) def instance_norm(g, input, **kwargs): diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index b770b900c4edd3..4f9299d258ea3e 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -480,14 +480,8 @@ def _run_symbolic_function(g, n, inputs, env, operator_export_type=OperatorExpor raise RuntimeError("Unsupported prim::Constant kind: `{}`. Send a bug report.".format( n.kindOf("value"))) elif op_name == "ListConstruct": - t = n.output().type() - # Tensor lists are used mostly for inputs to cat/stack. They need to be handled - # in those symbolics, and should become dead afterwards. - if t == torch._C.ListType.ofTensors(): - return None - elif t == torch._C.ListType.ofInts(): - unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] - return g.op("Concat", *unsqueezed, axis_i=0) + unsqueezed = [g.op("Unsqueeze", input, axes_i=[0]) for input in inputs] + return g.op("Concat", *unsqueezed, axis_i=0) elif op_name == "Undefined": # Undefined is not an ONNX operator; keep it as prim::Undefined # and let the exporter handle finally eliminating these diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 96cfaff8684cf0..ad7f780719ccd3 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -1,4 +1,3 @@ -import types import math import torch from torch._six import inf @@ -87,37 +86,6 @@ def __init__(self, optimizer, lr_lambda, last_epoch=-1): self.last_epoch = last_epoch super(LambdaLR, self).__init__(optimizer, last_epoch) - def state_dict(self): - """Returns the state of the scheduler as a :class:`dict`. - - It contains an entry for every variable in self.__dict__ which - is not the optimizer. - The learning rate lambda functions will only be saved if they are callable objects - and not if they are functions or lambdas. - """ - state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', 'lr_lambdas')} - state_dict['lr_lambdas'] = [None] * len(self.lr_lambdas) - - for idx, fn in enumerate(self.lr_lambdas): - if not isinstance(fn, types.FunctionType): - state_dict['lr_lambdas'][idx] = fn.__dict__.copy() - - return state_dict - - def load_state_dict(self, state_dict): - """Loads the schedulers state. - - Arguments: - state_dict (dict): scheduler state. Should be an object returned - from a call to :meth:`state_dict`. 
- """ - lr_lambdas = state_dict.pop('lr_lambdas') - self.__dict__.update(state_dict) - - for idx, fn in enumerate(lr_lambdas): - if fn is not None: - self.lr_lambdas[idx].__dict__.update(fn) - def get_lr(self): return [base_lr * lmbda(self.last_epoch) for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] diff --git a/torch/tensor.py b/torch/tensor.py index 9784fd59c9d2fb..6b587fcf903586 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -384,8 +384,6 @@ def __dir__(self): return sorted(keys) # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` - __array_priority__ = 1000 # prefer Tensor ops over numpy ones - def __array__(self, dtype=None): if dtype is None: return self.cpu().numpy()